修改 64 位变量的低字节,例如 rax/al 寄存器,无需编译器开销

发布于 2025-01-19 19:04:37 字数 8172 浏览 1 评论 0 原文

我有一段对性能至关重要的代码,其中实际上包含 uint8_t 变量(保证不会溢出),这些变量是从其他 uint8_t 值递增的,也用作数组索引和其他 64 位地址计算的一部分。

我查看了经过全面优化的 MSVC 编译器的反汇编,令人烦恼的是,无论我如何尝试,都有大量过多的 movzx 和其他不必要的附加指令用于在 8 位和 64 位操作之间进行转换。如果我使用 8 位变量,地址计算会通过临时寄存器执行额外的零扩展等,如果我使用 64 位变量,则对添加到其中的其他 8 位值进行类似的附加操作。

如果用汇编编写,就不会有问题,因为可以根据需要使用例如 rax 和 al 寄存器同时访问该值。是否有某种方法可以使用 C++ 访问 uint64_t 变量的低字节(特别是用于执行加法),以便当完整变量位于 rax 寄存器中时,MSVC 足够聪明,可以使用简单的直接 al 寄存器访问(例如 add al, other_uint8_var)来编译它?

我尝试了几种替代方法,例如位掩码高/低部分以模拟低字节更改、使用 8 位和 64 位值的联合别名、使用临时 8 位引用变量别名 64 位值等。只会导致更糟糕的结果,通常会将变量从寄存器移动到临时内存位置以执行更改。


最简单的例子:

#include <stdint.h>
// Returns ptr[tmp], where tmp is v[1] + v[2] narrowed to uint8_t (i.e. the sum modulo 256).
unsigned idx(unsigned *ptr, uint8_t *v)
{
    uint8_t tmp = v[1] + v[2];       // truncate the sum to 8-bit
    return ptr[tmp];                 // before using with a 64-bit pointer
}

所有编译器(Godbolt:GCC11/clang14/MSVC19.31/ICC19.01)都做得不好,浪费了一条 movzx eax,al,它甚至不能受益于 mov-elimination 的零延迟(参见 https://stackoverflow.com/questions/44169342/can-x86s-mov-really-be-free-why-cant-i-reproduct-this-at-all/44193770#44193770),因为它们在同一寄存器内扩展。 MSVC 19.31 -O2 编译为:

unsigned int idx(unsigned int *,unsigned char *) PROC                                ; idx, COMDAT
        movzx   eax, BYTE PTR [rdx+2]
        add     al, BYTE PTR [rdx+1]
        movzx   eax, al                       ;; fully redundant, value already truncated to 8-bit and zero-extended to 64
        mov     eax, DWORD PTR [rcx+rax*4]
        ret     0

Clang/LLVM 实际上做得更糟,从 mov al, [mem] 加载开始,对 RAX 的旧值存在错误依赖(在 P6 系列和第一代 Sandybridge 以外的 CPU 上)。但这节省了一个字节的机器代码大小。


MSVC 编译的更多示例:

以下可运行程序生成 3 个随机整数,它们相加为数组索引。有趣的部分被放置在 test() 函数中,以强制编译器使用所需的参数类型并将该部分分开,以便于从其他内联代码中查看程序集。

#include <iostream>
#include <cstdlib>

// Lookup table with 30 entries where array[i] == i.
static constexpr uint64_t array[30]{ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 };

// Holder for the function whose generated code is being inspected.
struct Test {
    // Returns array[base + added1 + added2]. The sum is accumulated in a
    // 64-bit index, so each uint8_t operand is widened before it is added.
    // The index is not range-checked against the table size.
    // __declspec(noinline) is an MSVC extension that keeps the function out of line.
    __declspec(noinline) static uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
        uint64_t index = base;
        index += added1;
        index += added2;
        return array[index];
    }
};

// Draws three values in [0, 9] with rand() (never seeded here), looks up
// array[base + added1 + added2] via Test::test and prints the expression
// together with the result.
int main()
{
    uint64_t base = rand() % 10;
    uint8_t added1 = rand() % 10;
    uint8_t added2 = rand() % 10;

    uint64_t result = Test::test(base, added1, added2);

    // The uint8_t operands are cast to uint64_t so they are streamed as numbers, not characters.
    std::cout << "array[" << base << "+" << (uint64_t)added1 << "+" << (uint64_t)added2 << "]=" << result << std::endl;
    return 0;
}

上述具有 uint64 基本索引的测试函数编译为:

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    movzx       edx,dl  
    add         rcx,rdx  
    movzx       eax,r8b  
    add         rax,rcx  
    lea         rcx,[array (07FF74FD63340h)]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

编译器已分配 rcx=base、dl=added1、r8b=added2。 uint8_t 值在求和之前单独进行零扩展。

将基本索引更改为 uint8_t 进行编译:

// Variant with an 8-bit accumulator: every += narrows back to uint8_t, so the
// index used for the lookup is (base + added1 + added2) modulo 256.
uint64_t test(uint8_t base, uint8_t added1, uint8_t added2) {
    uint8_t index = base;
    index += added1;
    index += added2;
    return array[index];
}

uint64_t test(uint8_t base, uint8_t added1, uint8_t added2) {
    add         cl,dl  
    add         cl,r8b  
    movzx       eax,cl  
    lea         rcx,[array (07FF6287C3340h)]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

因此,现在编译器很乐意使用 8 位寄存器进行数学计算,但需要单独对结果进行零扩展以进行寻址。

我想要得到的基本上是上面没有 movzx 的内容,这样 rcx 将直接用作数组偏移量,因为我知道除了最低字节之外的所有字节都已经为零。显然编译器无法自动执行此操作,因为与我不同,它不知道添加不会导致溢出。所以我缺少的是一种讲述这一点的方法。

如果我尝试将目标寄存器转换为 8 位(这往往适用于读取操作)或使用联合来对 rcx/cl 寄存器等变量进行建模,它会通过堆栈变量来实现:

// Variant that adds into the first byte of `index` through a uint8_t reference;
// the other seven bytes are left as they were (no carry propagates).
// That first byte is the low byte only on a little-endian target such as x86-64.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    reinterpret_cast<uint8_t&>(index) += added1;
    reinterpret_cast<uint8_t&>(index) += added2;
    return array[index];
}

OR:

// Overlays a 64-bit value with a uint8_t that shares its first byte of storage.
union Register {
    uint64_t u64;
    uint8_t u8;
};

// Variant using the Register union: stores base in u64, adds into u8, then
// reads u64 back for the lookup.
// NOTE(review): accessing a union member other than the one last written is
// type punning that ISO C++ leaves undefined; this relies on the compiler
// tolerating it.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    Register index;
    index.u64 = base;
    index.u8 += added1;
    index.u8 += added2;
    return array[index.u64];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    mov         qword ptr [rsp+8],rcx  
    add         cl,dl  
    add         cl,r8b  
    mov         byte ptr [index],cl  
    lea         rcx,[array (07FF798B53340h)]  
    mov         rax,qword ptr [index]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

尝试通过某些位掩码来表明我的意图,编译为:

// Variant that rebuilds the index from its upper 56 bits and the low byte of
// (index + added): the uint8_t cast drops any carry out of the low byte
// instead of propagating it into the upper bits.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    index = (index & 0xffffffffffffff00) | (uint8_t)(index + added1);
    index = (index & 0xffffffffffffff00) | (uint8_t)(index + added2);
    return array[index];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    lea         eax,[rdx+rcx]  
    and         rcx,0FFFFFFFFFFFFFF00h  
    movzx       edx,al  
    or          rdx,rcx  
    lea         rcx,[array (07FF70F4B3340h)]  
    lea         eax,[r8+rdx]  
    and         rdx,0FFFFFFFFFFFFFF00h  
    movzx       eax,al  
    or          rax,rdx  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

我相信某些类似的模式适用于某些编译器,用于覆盖(而不是添加)低字节,但这里编译器显然无法识别该模式。另一个类似的模式会生成以下内容:

// Variant that isolates the low byte with & 0xff, adds to it, masks the sum
// back to 8 bits and merges it with the untouched upper 56 bits, so a carry
// out of the low byte is discarded.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    index = (index & 0xffffffffffffff00) | (((index & 0xff) + added1) & 0xff);
    index = (index & 0xffffffffffffff00) | (((index & 0xff) + added2) & 0xff);
    return array[index];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    movzx       eax,dl  
    add         rax,rcx  
    xor         rax,rcx  
    movzx       edx,al  
    xor         rdx,rcx  
    movzx       eax,r8b  
    add         rax,rdx  
    lea         rcx,[array (07FF6859F3340h)]  
    xor         rax,rdx  
    movzx       eax,al  
    xor         rax,rdx  
    mov         rax,qword ptr [rcx+rax*8]  
    ret
}

I have a performance critical piece of code that contains in reality uint8_t variables (guaranteed not to overflow) that are incremented from other uint8_t values and also used as parts of array indexing and other 64-bit address calculations.

I have looked at the disassembly of MSVC compiler with full optimizations and annoyingly there's plenty of excessive movzx and other unnecessary additional instructions for converting between 8-bit and 64-bit operations, no matter how I try to do it. If I use a 8-bit variable, address calculations perform additional zero extensions etc. through temporary registers and if I use a 64-bit variable there are similar additional operations for the other 8-bit values that are added to it.

If that was written in assembly, there would be no problem, as the value could be accessed simultaneously through e.g. the rax and al registers as needed. Is there some way to access the low byte (especially for performing additions) of a uint64_t variable with C++ so that MSVC would be smart enough to compile it with e.g. simple direct al register access (e.g. add al, other_uint8_var) when the full variable is in the rax register?

I tried several alternatives like bit masking high/low parts for emulating the low byte change, aliasing with a union of 8-bit and 64-bit values, aliasing the 64-bit value with temporary 8-bit reference variable etc. All of them just led to worse results, often so that the variable was moved from the register to temporary memory location for performing the change.


Simplest example:

#include <stdint.h>
// Returns ptr[tmp], where tmp is v[1] + v[2] narrowed to uint8_t (i.e. the sum modulo 256).
unsigned idx(unsigned *ptr, uint8_t *v)
{
    uint8_t tmp = v[1] + v[2];       // truncate the sum to 8-bit
    return ptr[tmp];                 // before using with a 64-bit pointer
}

All compilers (Godbolt: GCC11/clang14/MSVC19.31/ICC19.01) do a bad job, wasting a movzx eax,al that can't even benefit from mov-elimination for zero latency because they widen within the same register. MSVC 19.31 -O2 compiles to:

unsigned int idx(unsigned int *,unsigned char *) PROC                                ; idx, COMDAT
        movzx   eax, BYTE PTR [rdx+2]
        add     al, BYTE PTR [rdx+1]
        movzx   eax, al                       ;; fully redundant, value already truncated to 8-bit and zero-extended to 64
        mov     eax, DWORD PTR [rcx+rax*4]
        ret     0

Clang/LLVM actually does even worse, starting with a mov al, [mem] load with a false dependency on the old value of RAX (on CPUs other than P6-family and first-generation Sandybridge). But it saves one byte of machine-code size.


Further examples of how MSVC compiles:

The following runnable program generates 3 random integers which are summed as array index. Interesting part is placed in the test() function to force the compiler to use the desired argument types and keep the part separated for easy viewing of the assembly from the otherwise inlined code.

#include <iostream>
#include <cstdlib>

// Lookup table with 30 entries where array[i] == i.
static constexpr uint64_t array[30]{ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 };

// Holder for the function whose generated code is being inspected.
struct Test {
    // Returns array[base + added1 + added2]. The sum is accumulated in a
    // 64-bit index, so each uint8_t operand is widened before it is added.
    // The index is not range-checked against the table size.
    // __declspec(noinline) is an MSVC extension that keeps the function out of line.
    __declspec(noinline) static uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
        uint64_t index = base;
        index += added1;
        index += added2;
        return array[index];
    }
};

// Draws three values in [0, 9] with rand() (never seeded here), looks up
// array[base + added1 + added2] via Test::test and prints the expression
// together with the result.
int main()
{
    uint64_t base = rand() % 10;
    uint8_t added1 = rand() % 10;
    uint8_t added2 = rand() % 10;

    uint64_t result = Test::test(base, added1, added2);

    // The uint8_t operands are cast to uint64_t so they are streamed as numbers, not characters.
    std::cout << "array[" << base << "+" << (uint64_t)added1 << "+" << (uint64_t)added2 << "]=" << result << std::endl;
    return 0;
}

The above test function with uint64 base index compiles to:

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    movzx       edx,dl  
    add         rcx,rdx  
    movzx       eax,r8b  
    add         rax,rcx  
    lea         rcx,[array (07FF74FD63340h)]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

Compiler has assigned rcx=base, dl=added1, r8b=added2.
uint8_t values are separately zero extended before summing.

Changing the base index to uint8_t compiles:

// Variant with an 8-bit accumulator: every += narrows back to uint8_t, so the
// index used for the lookup is (base + added1 + added2) modulo 256.
uint64_t test(uint8_t base, uint8_t added1, uint8_t added2) {
    uint8_t index = base;
    index += added1;
    index += added2;
    return array[index];
}

uint64_t test(uint8_t base, uint8_t added1, uint8_t added2) {
    add         cl,dl  
    add         cl,r8b  
    movzx       eax,cl  
    lea         rcx,[array (07FF6287C3340h)]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

So now the compiler is happy to do the math with 8-bit registers but needs to separately zero extend the result for addressing.

What I would like to get is basically the above without the movzx, so that rcx would be used directly as the array offset, since I know all except the lowest byte are already zero. Obviously the compiler cannot do that automatically, since unlike me, it doesn't know the additions can't cause overflow. So what I'm missing is a way to tell it just that.

If I try to cast the target register as 8-bit (which tends to work for read operations) or use a union for modeling the variable like rcx/cl registers, it does it through a stack variable:

// Variant that adds into the first byte of `index` through a uint8_t reference;
// the other seven bytes are left as they were (no carry propagates).
// That first byte is the low byte only on a little-endian target such as x86-64.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    reinterpret_cast<uint8_t&>(index) += added1;
    reinterpret_cast<uint8_t&>(index) += added2;
    return array[index];
}

OR:

// Overlays a 64-bit value with a uint8_t that shares its first byte of storage.
union Register {
    uint64_t u64;
    uint8_t u8;
};

// Variant using the Register union: stores base in u64, adds into u8, then
// reads u64 back for the lookup.
// NOTE(review): accessing a union member other than the one last written is
// type punning that ISO C++ leaves undefined; this relies on the compiler
// tolerating it.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    Register index;
    index.u64 = base;
    index.u8 += added1;
    index.u8 += added2;
    return array[index.u64];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    mov         qword ptr [rsp+8],rcx  
    add         cl,dl  
    add         cl,r8b  
    mov         byte ptr [index],cl  
    lea         rcx,[array (07FF798B53340h)]  
    mov         rax,qword ptr [index]  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

Attempting to indicate my intentions through some bit masking compiles to:

// Variant that rebuilds the index from its upper 56 bits and the low byte of
// (index + added): the uint8_t cast drops any carry out of the low byte
// instead of propagating it into the upper bits.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    index = (index & 0xffffffffffffff00) | (uint8_t)(index + added1);
    index = (index & 0xffffffffffffff00) | (uint8_t)(index + added2);
    return array[index];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    lea         eax,[rdx+rcx]  
    and         rcx,0FFFFFFFFFFFFFF00h  
    movzx       edx,al  
    or          rdx,rcx  
    lea         rcx,[array (07FF70F4B3340h)]  
    lea         eax,[r8+rdx]  
    and         rdx,0FFFFFFFFFFFFFF00h  
    movzx       eax,al  
    or          rax,rdx  
    mov         rax,qword ptr [rcx+rax*8]  
    ret  
}

I believe some similar pattern works on some compilers for overwriting (instead of adding to) the low byte, but here the compiler clearly failed to recognize that pattern. Another similar pattern generates this:

// Variant that isolates the low byte with & 0xff, adds to it, masks the sum
// back to 8 bits and merges it with the untouched upper 56 bits, so a carry
// out of the low byte is discarded.
uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    uint64_t index = base;
    index = (index & 0xffffffffffffff00) | (((index & 0xff) + added1) & 0xff);
    index = (index & 0xffffffffffffff00) | (((index & 0xff) + added2) & 0xff);
    return array[index];
}

uint64_t test(uint64_t base, uint8_t added1, uint8_t added2) {
    movzx       eax,dl  
    add         rax,rcx  
    xor         rax,rcx  
    movzx       edx,al  
    xor         rdx,rcx  
    movzx       eax,r8b  
    add         rax,rdx  
    lea         rcx,[array (07FF6859F3340h)]  
    xor         rax,rdx  
    movzx       eax,al  
    xor         rax,rdx  
    mov         rax,qword ptr [rcx+rax*8]  
    ret
}

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

痕至 2025-01-26 19:04:37

为什么不简单地编写 inline 汇编代码?对于非常关键的代码,可能是唯一的解决方案...我必须这样做才能从裸机平台上的compactflash读取,例如:C或C ++代码甚至没有能够匹配所需的时间...

此外,对此汇编代码进行基准测试,可以帮助您确认是否真的获得了更多的性能——或者没有!——也许还能给您一个目标和/或指示您可以期望达到的水平。

如果您需要可移植性,仍然可以使用条件编译:为所有已知目标提供优化的 inline 汇编,并为其他所有情况保留您现有的最佳 C/C++ 代码——这不会是完美的,但并非所有 CPU 都有部分寄存器,因此通用的 C 代码可以在这些平台上解决问题。

Why not simply write inline assembler code? For really highly critical code, it may be the only solution... I had to do that for reading from a CompactFlash on a bare-metal platform, for example: C or C++ code wasn't even able to match required timings...

Also, benchmarking this assembler code may help you see whether you really get more performance - or not! - and maybe give you a goal and/or an indication of what you can expect to reach.

If you need portability, you can still use conditional compilation, with optimized inline assembler for all known targets, and leave the best C/C++ code you have for every other case - it won't be perfect, but not all CPUs have partial registers anyway, so generic C code could do the trick for these platforms.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文