我的编译器在做什么? (优化memcpy)
我正在使用 VC++2010 中的以下设置编译一些代码: /O2 /Ob2 /Oi /Ot
但是,我在理解生成的程序集的某些部分时遇到了一些问题,我在代码中提出了一些问题:评论。
另外,现代 cpu 上通常建议的预取距离是多少?我可以在自己的 cpu 上进行 ofc 测试,但我希望得到一些能够在更广泛的 cpu 上正常工作的值。也许可以使用动态预取距离?
<--编辑:
另一件事让我感到惊讶的是编译器不会以某种形式交错 movdqa 和 movntdq 指令?因为根据我的理解,这些指令在某种意义上是异步的。
此代码还假设预取时有 32 字节缓存行,但高端 cpu 似乎有 64 字节缓存行,因此可以删除其中 2 个预取。
-->
void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{
0052AC20 push ebp
0052AC21 mov ebp,esp
const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
for(size_t n = 0; n < size/16; n += 8)
0052AC23 mov edx,dword ptr [size]
0052AC26 mov ecx,dword ptr [dest]
0052AC29 mov eax,dword ptr [source]
0052AC2C shr edx,4
0052AC2F test edx,edx
0052AC31 je copy+9Eh (52ACBEh)
__m128i xmm0 = _mm_setzero_si128();
__m128i xmm1 = _mm_setzero_si128();
__m128i xmm2 = _mm_setzero_si128();
__m128i xmm3 = _mm_setzero_si128();
__m128i xmm4 = _mm_setzero_si128();
__m128i xmm5 = _mm_setzero_si128();
__m128i xmm6 = _mm_setzero_si128();
__m128i xmm7 = _mm_setzero_si128();
__m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
0052AC37 push esi
0052AC38 push edi
0052AC39 lea edi,[edx-1]
0052AC3C shr edi,3
0052AC3F inc edi
{
_mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
xmm0 = _mm_load_si128(source_128++);
xmm1 = _mm_load_si128(source_128++);
xmm2 = _mm_load_si128(source_128++);
xmm3 = _mm_load_si128(source_128++);
xmm4 = _mm_load_si128(source_128++);
xmm5 = _mm_load_si128(source_128++);
xmm6 = _mm_load_si128(source_128++);
xmm7 = _mm_load_si128(source_128++);
0052AC40 movdqa xmm6,xmmword ptr [eax+70h] // 1. Why is this moved before the pretecthes?
0052AC45 prefetchnta [eax+80h]
0052AC4C prefetchnta [eax+0A0h]
0052AC53 prefetchnta [eax+0C0h]
0052AC5A prefetchnta [eax+0E0h]
0052AC61 movdqa xmm0,xmmword ptr [eax+10h]
0052AC66 movdqa xmm1,xmmword ptr [eax+20h]
0052AC6B movdqa xmm2,xmmword ptr [eax+30h]
0052AC70 movdqa xmm3,xmmword ptr [eax+40h]
0052AC75 movdqa xmm4,xmmword ptr [eax+50h]
0052AC7A movdqa xmm5,xmmword ptr [eax+60h]
0052AC7F lea esi,[eax+70h] // 2. What is happening in these 2 lines?
0052AC82 mov edx,eax //
0052AC84 movdqa xmm7,xmmword ptr [edx] // 3. Why edx? and not simply eax?
_mm_stream_si128(dest_128++, xmm0);
0052AC88 mov esi,ecx // 4. Is esi never used?
0052AC8A movntdq xmmword ptr [esi],xmm7
_mm_stream_si128(dest_128++, xmm1);
0052AC8E movntdq xmmword ptr [ecx+10h],xmm0
_mm_stream_si128(dest_128++, xmm2);
0052AC93 movntdq xmmword ptr [ecx+20h],xmm1
_mm_stream_si128(dest_128++, xmm3);
0052AC98 movntdq xmmword ptr [ecx+30h],xmm2
_mm_stream_si128(dest_128++, xmm4);
0052AC9D movntdq xmmword ptr [ecx+40h],xmm3
_mm_stream_si128(dest_128++, xmm5);
0052ACA2 movntdq xmmword ptr [ecx+50h],xmm4
_mm_stream_si128(dest_128++, xmm6);
0052ACA7 movntdq xmmword ptr [ecx+60h],xmm5
_mm_stream_si128(dest_128++, xmm7);
0052ACAC lea edx,[ecx+70h]
0052ACAF sub eax,0FFFFFF80h
0052ACB2 sub ecx,0FFFFFF80h
0052ACB5 dec edi
0052ACB6 movntdq xmmword ptr [edx],xmm6 // 5. Why not simply ecx?
0052ACBA jne copy+20h (52AC40h)
0052ACBC pop edi
0052ACBD pop esi
}
}
原始代码:
void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{
assert(dest != nullptr);
assert(source != nullptr);
assert(source != dest);
assert(size % 128 == 0);
__m128i xmm0 = _mm_setzero_si128();
__m128i xmm1 = _mm_setzero_si128();
__m128i xmm2 = _mm_setzero_si128();
__m128i xmm3 = _mm_setzero_si128();
__m128i xmm4 = _mm_setzero_si128();
__m128i xmm5 = _mm_setzero_si128();
__m128i xmm6 = _mm_setzero_si128();
__m128i xmm7 = _mm_setzero_si128();
__m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
for(size_t n = 0; n < size/16; n += 8)
{
_mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
xmm0 = _mm_load_si128(source_128++);
xmm1 = _mm_load_si128(source_128++);
xmm2 = _mm_load_si128(source_128++);
xmm3 = _mm_load_si128(source_128++);
xmm4 = _mm_load_si128(source_128++);
xmm5 = _mm_load_si128(source_128++);
xmm6 = _mm_load_si128(source_128++);
xmm7 = _mm_load_si128(source_128++);
_mm_stream_si128(dest_128++, xmm0);
_mm_stream_si128(dest_128++, xmm1);
_mm_stream_si128(dest_128++, xmm2);
_mm_stream_si128(dest_128++, xmm3);
_mm_stream_si128(dest_128++, xmm4);
_mm_stream_si128(dest_128++, xmm5);
_mm_stream_si128(dest_128++, xmm6);
_mm_stream_si128(dest_128++, xmm7);
}
}
I'm compiling a bit of code using the following settings in VC++2010: /O2 /Ob2 /Oi /Ot
However I'm having some trouble understanding some parts of the assembly generated, I have put some questions in the code as comments.
Also, what prefetching distance is generally recommended on modern cpus? I can ofc test on my own cpu, but I was hoping for some value that will work well on a wider range of cpus. Maybe one could use dynamic prefetching distances?
<--EDIT:
Another thing I'm surprised about is that the compiler does not interleave in some form the movdqa and movntdq instructions? Since these instructions are in some sense asynchronous from my understanding.
This code also assumes 32 byte cache lines when prefetching, however it seems that high-end cpus have 64 byte cachelines, so 2 of the prefetches can probably be removed.
-->
void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{
0052AC20 push ebp
0052AC21 mov ebp,esp
const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
for(size_t n = 0; n < size/16; n += 8)
0052AC23 mov edx,dword ptr [size]
0052AC26 mov ecx,dword ptr [dest]
0052AC29 mov eax,dword ptr [source]
0052AC2C shr edx,4
0052AC2F test edx,edx
0052AC31 je copy+9Eh (52ACBEh)
__m128i xmm0 = _mm_setzero_si128();
__m128i xmm1 = _mm_setzero_si128();
__m128i xmm2 = _mm_setzero_si128();
__m128i xmm3 = _mm_setzero_si128();
__m128i xmm4 = _mm_setzero_si128();
__m128i xmm5 = _mm_setzero_si128();
__m128i xmm6 = _mm_setzero_si128();
__m128i xmm7 = _mm_setzero_si128();
__m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
0052AC37 push esi
0052AC38 push edi
0052AC39 lea edi,[edx-1]
0052AC3C shr edi,3
0052AC3F inc edi
{
_mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
xmm0 = _mm_load_si128(source_128++);
xmm1 = _mm_load_si128(source_128++);
xmm2 = _mm_load_si128(source_128++);
xmm3 = _mm_load_si128(source_128++);
xmm4 = _mm_load_si128(source_128++);
xmm5 = _mm_load_si128(source_128++);
xmm6 = _mm_load_si128(source_128++);
xmm7 = _mm_load_si128(source_128++);
0052AC40 movdqa xmm6,xmmword ptr [eax+70h] // 1. Why is this moved before the pretecthes?
0052AC45 prefetchnta [eax+80h]
0052AC4C prefetchnta [eax+0A0h]
0052AC53 prefetchnta [eax+0C0h]
0052AC5A prefetchnta [eax+0E0h]
0052AC61 movdqa xmm0,xmmword ptr [eax+10h]
0052AC66 movdqa xmm1,xmmword ptr [eax+20h]
0052AC6B movdqa xmm2,xmmword ptr [eax+30h]
0052AC70 movdqa xmm3,xmmword ptr [eax+40h]
0052AC75 movdqa xmm4,xmmword ptr [eax+50h]
0052AC7A movdqa xmm5,xmmword ptr [eax+60h]
0052AC7F lea esi,[eax+70h] // 2. What is happening in these 2 lines?
0052AC82 mov edx,eax //
0052AC84 movdqa xmm7,xmmword ptr [edx] // 3. Why edx? and not simply eax?
_mm_stream_si128(dest_128++, xmm0);
0052AC88 mov esi,ecx // 4. Is esi never used?
0052AC8A movntdq xmmword ptr [esi],xmm7
_mm_stream_si128(dest_128++, xmm1);
0052AC8E movntdq xmmword ptr [ecx+10h],xmm0
_mm_stream_si128(dest_128++, xmm2);
0052AC93 movntdq xmmword ptr [ecx+20h],xmm1
_mm_stream_si128(dest_128++, xmm3);
0052AC98 movntdq xmmword ptr [ecx+30h],xmm2
_mm_stream_si128(dest_128++, xmm4);
0052AC9D movntdq xmmword ptr [ecx+40h],xmm3
_mm_stream_si128(dest_128++, xmm5);
0052ACA2 movntdq xmmword ptr [ecx+50h],xmm4
_mm_stream_si128(dest_128++, xmm6);
0052ACA7 movntdq xmmword ptr [ecx+60h],xmm5
_mm_stream_si128(dest_128++, xmm7);
0052ACAC lea edx,[ecx+70h]
0052ACAF sub eax,0FFFFFF80h
0052ACB2 sub ecx,0FFFFFF80h
0052ACB5 dec edi
0052ACB6 movntdq xmmword ptr [edx],xmm6 // 5. Why not simply ecx?
0052ACBA jne copy+20h (52AC40h)
0052ACBC pop edi
0052ACBD pop esi
}
}
original code:
void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{
assert(dest != nullptr);
assert(source != nullptr);
assert(source != dest);
assert(size % 128 == 0);
__m128i xmm0 = _mm_setzero_si128();
__m128i xmm1 = _mm_setzero_si128();
__m128i xmm2 = _mm_setzero_si128();
__m128i xmm3 = _mm_setzero_si128();
__m128i xmm4 = _mm_setzero_si128();
__m128i xmm5 = _mm_setzero_si128();
__m128i xmm6 = _mm_setzero_si128();
__m128i xmm7 = _mm_setzero_si128();
__m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
for(size_t n = 0; n < size/16; n += 8)
{
_mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
xmm0 = _mm_load_si128(source_128++);
xmm1 = _mm_load_si128(source_128++);
xmm2 = _mm_load_si128(source_128++);
xmm3 = _mm_load_si128(source_128++);
xmm4 = _mm_load_si128(source_128++);
xmm5 = _mm_load_si128(source_128++);
xmm6 = _mm_load_si128(source_128++);
xmm7 = _mm_load_si128(source_128++);
_mm_stream_si128(dest_128++, xmm0);
_mm_stream_si128(dest_128++, xmm1);
_mm_stream_si128(dest_128++, xmm2);
_mm_stream_si128(dest_128++, xmm3);
_mm_stream_si128(dest_128++, xmm4);
_mm_stream_si128(dest_128++, xmm5);
_mm_stream_si128(dest_128++, xmm6);
_mm_stream_si128(dest_128++, xmm7);
}
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(3)
eax+70h 读取会向上移动,因为 eax+70h 与 eax 位于不同的缓存行中,并且编译器可能希望硬件预取器尽快忙于获取该行。
它不进行交错,要么是因为它希望通过避免加载到存储的依赖关系来最大化性能(即使 AMD 优化指南明确指出要交错),要么只是因为它不确定存储不会覆盖加载。如果将 __restrict 关键字添加到源和目标,它会改变行为吗?
其余部分的目的也让我无法理解。对于 AMD 或 Intel 来说,可能是一些模糊的指令解码或硬件预取器考虑因素,但我找不到任何理由。我想知道当你删除这些指令时代码会变得更快还是更慢?
建议的预取距离取决于循环大小。需要足够远,以便数据在需要时有时间从内存到达。我认为你通常需要给它至少 100 个时钟周期。
eax+70h read is moved up because eax+70h is in a different cache line from eax, and the compiler probably wants the hardware prefetcher to get busy getting that line as soon as possible.
It does not interleave either because it wants to maximize performance by avoiding load-to-store dependencies (even though the AMD optimization guide explicitly says to interleave), or simply because it is not sure that stores won't overwrite loads. Does it change the behavior if you add __restrict keywords to source and dest?
The purpose of the rest of it eludes me too. Could be some obscure instruction decoding or hardware prefetcher considerations, either for AMD or Intel, but I can't find any justification for that. I wonder if the code gets faster or slower when you remove those instructions?
The recommended prefetching distance depends on the loop size. Needs to be far enough that the data has time to arrive from the memory by the time it's needed. I think that you usually need to give it at least 100 clock ticks.
我还没有弄清楚编译器的作用,但我想分享一些我的测试结果。我已经用汇编重写了该函数。
系统:Xeon W3520
4.55 GB/s:常规 memcpy
5.52 GB/s:有问题的 memcpy
5.58 GB/s:memcpy 低于
7.48 GB/s:memcpy 低于多线程
I haven't figured out what the compiler does, however I though I'd share some of my testing results. I've rewritten the function in assembly.
System: Xeon W3520
4.55 GB/s : regular memcpy
5.52 GB/s : memcpy in question
5.58 GB/s : memcpy below
7.48 GB/s : memcpy below multithreaded
因为它希望适当地分割数据路径,以便该指令
可以并行执行。
点号 4 可能是预取器的提示......可能(因为否则它没有任何意义,也可能是编译器/优化器错误/怪癖)。
我对5一点没有任何想法
because it wants propably to split the datapath so this instruction
can be executed in parallel.
Point number 4 could be a hint for the prefetcher...propably (because else it doesn't make any sense, could also be a compiler/optimizer bug/quirk).
I don't have any idea about point 5