为什么要在 ASM 中指定变量的地址而不是仅仅将其复制到寄存器中？

发布于 2024-12-12 00:44:12 字数 1468 浏览 6 评论 0原文

在我学习汇编（在 x86_64 上使用 GCC）的过程中，我遇到了一些 SSE 示例：它们并没有直接把 C 变量复制到寄存器中，而是把变量的地址复制到了 EAX 中。既然可以像下面这样写，为什么还要那样做：

// GCC vector extension: a 16-byte vector of float, i.e. four float lanes.
typedef float v4sf __attribute__((vector_size(16)));

// Overlays the same 16 bytes as either one vector (v) or four floats (f[0..3]).
typedef union {
    v4sf v;
    float f[4];
} Vec4;

// NOTE(review): `Vec4 vector.v = ...` is not a valid C declaration as written
// (a declarator cannot contain `.v`); presumably the intent is to declare
// `vector` and initialize its `v` member with these four values.
Vec4 vector.v = (v4sf){ 64.1,128.2,256.3,512.4 };
float blah = 2.2;

// Multiplies every lane of `vector` by the scalar `blah`, in place.
// Operands: %0 = vector (read-write memory operand, "+m"),
//           %1 = blah   (read-only memory operand, "m").
// AT&T operand order: source first, destination last.
// NOTE(review): the second movups is a 16-byte load, but %1 refers to the
// 4-byte float `blah`, so it reads past the object; a scalar load (movss)
// would fetch exactly the float that shufps then broadcasts.
__asm__("movups %0, %%xmm0 \n\t"                // xmm0 = vector (unaligned 16-byte load)
    "movups %1, %%xmm1 \n\t"                    // xmm1 = 16 bytes starting at blah
    "shufps $0x00, %%xmm1, %%xmm1 \n\t"         // broadcast lane 0 of xmm1 to all four lanes
    "mulps %%xmm1, %%xmm0 \n\t"                 // xmm0 = xmm0 * xmm1, lane by lane
    "movups %%xmm0, %0 \n\t"                    // store the product back into vector
    : "+m"(vector)
    : "m"(blah)
    : "%xmm0","%xmm1"                           // registers overwritten by the asm
);

将向量复制到 xmm0 （而不是将其保留在内存中）会导致性能下降吗？

这是我正在谈论的示例（它是英特尔语法）：

// Computes, for four packed floats, result[i] ~= a[i] / (b - a[i]*b + a[i])
// using SSE (presumably Schlick's approximation of pow, going by the name).
//   a      - pointer to four input floats; read with an unaligned load
//   b      - scalar, broadcast to all four lanes
//   result - pointer to four output floats; written with an unaligned store
// The division is done with rcpps, which yields an approximate reciprocal,
// so the result is not exact to full float precision.
// MSVC-style inline assembly (Intel syntax); pointers are held in the 32-bit
// EAX register, so the code as written targets 32-bit x86.
void powf_schlickSSE(const float * a, const float b, float * result){

    __asm {
        mov         eax, a              // eax = a (address of the input vector)
        movss       xmm0, dword ptr [b] // xmm0 lane 0 = b (scalar load)
        movups      xmm1, [eax]         // xmm1 = a[0..3] (unaligned load)
        shufps      xmm0, xmm0, 0       // broadcast b into all four lanes of xmm0
        movaps      xmm2, xmm1          // xmm2 = copy of a, needed again below
        mov         eax, result         // eax = result (address of the output)
        mulps       xmm1, xmm0          // xmm1 = a*b
        subps       xmm0, xmm1          // xmm0 = b - a*b
        addps       xmm0, xmm2          // xmm0 = b - a*b + a
        rcpps       xmm0, xmm0          // xmm0 ~= 1 / (b - a*b + a)  (approximate reciprocal)
        mulps       xmm2, xmm0          // xmm2 = a * (1 / (b - a*b + a))
        movups      [eax], xmm2         // store xmm2 to result[0..3] (unaligned store)
    }
}

原文

In my quest to learn assembly (using GCC on x86_64), I have come across some SSE examples where, instead of just copying a C variable into a register, the address is copied into EAX. Why do that when you can just do this:

// GCC vector extension: a 16-byte vector of float, i.e. four float lanes.
typedef float v4sf __attribute__((vector_size(16)));

// Overlays the same 16 bytes as either one vector (v) or four floats (f[0..3]).
typedef union {
    v4sf v;
    float f[4];
} Vec4;

// NOTE(review): `Vec4 vector.v = ...` is not a valid C declaration as written
// (a declarator cannot contain `.v`); presumably the intent is to declare
// `vector` and initialize its `v` member with these four values.
Vec4 vector.v = (v4sf){ 64.1,128.2,256.3,512.4 };
float blah = 2.2;

// Multiplies every lane of `vector` by the scalar `blah`, in place.
// Operands: %0 = vector (read-write memory operand, "+m"),
//           %1 = blah   (read-only memory operand, "m").
// AT&T operand order: source first, destination last.
// NOTE(review): the second movups is a 16-byte load, but %1 refers to the
// 4-byte float `blah`, so it reads past the object; a scalar load (movss)
// would fetch exactly the float that shufps then broadcasts.
__asm__("movups %0, %%xmm0 \n\t"                // xmm0 = vector (unaligned 16-byte load)
    "movups %1, %%xmm1 \n\t"                    // xmm1 = 16 bytes starting at blah
    "shufps $0x00, %%xmm1, %%xmm1 \n\t"         // broadcast lane 0 of xmm1 to all four lanes
    "mulps %%xmm1, %%xmm0 \n\t"                 // xmm0 = xmm0 * xmm1, lane by lane
    "movups %%xmm0, %0 \n\t"                    // store the product back into vector
    : "+m"(vector)
    : "m"(blah)
    : "%xmm0","%xmm1"                           // registers overwritten by the asm
);

Does copying the vector into xmm0 (rather than keeping it in memory) cause a performance hit?

Here is an example of what I'm talking about (it's Intel syntax):

// Computes, for four packed floats, result[i] ~= a[i] / (b - a[i]*b + a[i])
// using SSE (presumably Schlick's approximation of pow, going by the name).
//   a      - pointer to four input floats; read with an unaligned load
//   b      - scalar, broadcast to all four lanes
//   result - pointer to four output floats; written with an unaligned store
// The division is done with rcpps, which yields an approximate reciprocal,
// so the result is not exact to full float precision.
// MSVC-style inline assembly (Intel syntax); pointers are held in the 32-bit
// EAX register, so the code as written targets 32-bit x86.
void powf_schlickSSE(const float * a, const float b, float * result){

    __asm {
        mov         eax, a              // eax = a (address of the input vector)
        movss       xmm0, dword ptr [b] // xmm0 lane 0 = b (scalar load)
        movups      xmm1, [eax]         // xmm1 = a[0..3] (unaligned load)
        shufps      xmm0, xmm0, 0       // broadcast b into all four lanes of xmm0
        movaps      xmm2, xmm1          // xmm2 = copy of a, needed again below
        mov         eax, result         // eax = result (address of the output)
        mulps       xmm1, xmm0          // xmm1 = a*b
        subps       xmm0, xmm1          // xmm0 = b - a*b
        addps       xmm0, xmm2          // xmm0 = b - a*b + a
        rcpps       xmm0, xmm0          // xmm0 ~= 1 / (b - a*b + a)  (approximate reciprocal)
        mulps       xmm2, xmm0          // xmm2 = a * (1 / (b - a*b + a))
        movups      [eax], xmm2         // store xmm2 to result[0..3] (unaligned store)
    }
}

分享到QQ

分享到微博