SSE instructions: which CPUs can do atomic 16B memory operations?

Posted 2024-12-08 03:50:27


Consider a single memory access (a single read or a single write, not read+write) SSE instruction on an x86 CPU. The instruction is accessing 16 bytes (128 bits) of memory and the accessed memory location is aligned to 16 bytes.

The document "Intel® 64 Architecture Memory Ordering White Paper" states that for "Instructions that read or write a quadword (8 bytes) whose address is aligned on an 8 byte boundary" the memory operation appears to execute as a single memory access regardless of memory type.

The question: Do there exist Intel/AMD/etc x86 CPUs which guarantee that reading or writing 16 bytes (128 bits) aligned to a 16 byte boundary executes as a single memory access? If so, which particular type of CPU is it (Core2/Atom/K8/Phenom/...)? If you provide an answer (yes/no) to this question, please also specify the method that was used to determine the answer - PDF document lookup, brute force testing, math proof, or whatever other method you used to determine the answer.

This question relates to problems such as http://research.swtch.com/2010/02/off-to-races.html


Update:

I created a simple test program in C that you can run on your computers. Please compile and run it on your Phenom, Athlon, Bobcat, Core2, Atom, Sandy Bridge or whatever SSE2-capable CPU you happen to have. Thanks.

// Compile with:
//   gcc -o a a.c -pthread -msse2 -std=c99 -Wall -O2
//
// Make sure you have at least two physical CPU cores or hyper-threading.

#include <pthread.h>
#include <emmintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>     // for abort()
#include <string.h>

typedef int v4si __attribute__ ((vector_size (16)));
volatile v4si x;

unsigned n1[16] __attribute__((aligned(64)));
unsigned n2[16] __attribute__((aligned(64)));

void* thread1(void *arg) {
        for (int i=0; i<100*1000*1000; i++) {
                int mask = _mm_movemask_ps((__m128)x);
                n1[mask]++;

                x = (v4si){0,0,0,0};
        }
        return NULL;
}

void* thread2(void *arg) {
        for (int i=0; i<100*1000*1000; i++) {
                int mask = _mm_movemask_ps((__m128)x);
                n2[mask]++;

                x = (v4si){-1,-1,-1,-1};
        }
        return NULL;
}

int main() {
        // Check memory alignment
        if ( (((uintptr_t)&x) & 0x0f) != 0 )
                abort();

        memset(n1, 0, sizeof(n1));
        memset(n2, 0, sizeof(n2));

        pthread_t t1, t2;
        pthread_create(&t1, NULL, thread1, NULL);
        pthread_create(&t2, NULL, thread2, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);

        for (unsigned i=0; i<16; i++) {
                for (int j=3; j>=0; j--)
                        printf("%d", (i>>j)&1);

                printf("  %10u %10u", n1[i], n2[i]);
                if(i>0 && i<0x0f) {
                        if(n1[i] || n2[i])
                                printf("  Not a single memory access!");
                }

                printf("\n");
        }

        return 0;
}

The CPU I have in my notebook is Core Duo (not Core2). This particular CPU fails the test: it implements 16-byte memory reads/writes with a granularity of 8 bytes. (The masks 0011 and 1100 in the output correspond to exactly one 8-byte half of the vector holding -1, which is what tearing at an 8-byte boundary looks like through MOVMSKPS.) The output is:

0000    96905702      10512
0001           0          0
0010           0          0
0011          22      12924  Not a single memory access!
0100           0          0
0101           0          0
0110           0          0
0111           0          0
1000           0          0
1001           0          0
1010           0          0
1011           0          0
1100     3092557       1175  Not a single memory access!
1101           0          0
1110           0          0
1111        1719   99975389


7 Answers

一袭水袖舞倾城 2024-12-15 03:50:27


In the Intel® 64 and IA-32 Architectures Developer's Manual: Vol. 3A, which nowadays contains the specifications of the memory ordering white paper you mention, it is said in section 8.1.1 that:

The Intel486 processor (and newer processors since) guarantees that
the following basic memory operations will always be carried out
atomically:

  • Reading or writing a byte.

  • Reading or writing a word aligned on a 16-bit boundary.

  • Reading or writing a doubleword aligned on a 32-bit boundary.

The Pentium processor (and newer processors since) guarantees that the
following additional memory operations will always be carried out
atomically:

  • Reading or writing a quadword aligned on a 64-bit boundary.

  • 16-bit accesses to uncached memory locations that fit within a 32-bit data bus.

The P6 family processors (and newer processors since) guarantee that
the following additional memory operation will always be carried out
atomically:

  • Unaligned 16-, 32-, and 64-bit accesses to cached memory that fit within a cache line.

Processors that enumerate support for Intel® AVX (by setting the
feature flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the 16-byte
memory operations performed by the following instructions will always
be carried out atomically:

  • MOVAPD, MOVAPS, and MOVDQA.
  • VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
  • VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with EVEX.128 and k0 (masking disabled).

(Note that these instructions require the linear addresses of their memory operands to be 16-byte aligned.)

Each of the writes x = (v4si){0,0,0,0} and x = (v4si){-1,-1,-1,-1} is probably compiled into a single 16-byte MOVAPS. The address of x is 16-byte aligned. On an Intel processor that supports AVX, these writes are atomic. Otherwise, they are not atomic.
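
If one wants to remove that "probably" without reading the compiler's output every time, the store can be pinned down with a GCC inline-asm helper; a minimal sketch (the helper name is illustrative, not part of the original test program, and EDIT 3 below shows the asm GCC actually produced):

#include <emmintrin.h>

// Force exactly one 16-byte MOVAPS store; the compiler cannot split it
// into two 8-byte stores or substitute another instruction.
static inline void store16(volatile __m128i *p, __m128i v) {
        asm volatile("movaps %1, %0" : "=m"(*p) : "x"(v));
}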

On AMD processors, AMD64 Architecture Programmer's Manual, Section 7.3.2 Access Atomicity states that

Cacheable, naturally-aligned single loads or stores of up to a quadword are atomic on any processor
model, as are misaligned loads or stores of less than a quadword that are contained entirely within a
naturally-aligned quadword. Misaligned load or store accesses typically incur a small latency penalty.
Model-specific relaxations of this quadword atomicity boundary, with respect to this latency penalty,
may be found in a given processor's Software Optimization Guide.
Misaligned accesses can be subject to interleaved accesses from other processors or cache-coherent
devices which can result in unintended behavior. Atomicity for misaligned accesses can be achieved
where necessary by using the XCHG instruction or any suitable LOCK-prefixed instruction.
Processors that report CPUID Fn0000_0001_ECX[AVX](bit 28) = 1 extend the atomicity for
cacheable, naturally-aligned single loads or stores from a quadword to a double quadword.

That is, similarly to Intel, AMD does guarantee that on processors supporting AVX instructions, aligned 16-byte load and store instructions are atomic.

On Intel and AMD processors that don't support AVX, the CMPXCHG16B instruction with the LOCK prefix can be used. You can use the CPUID instruction to figure out if your processor supports CMPXCHG16B (the "CX16" feature bit).
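
A minimal sketch of both steps with GCC builtins (assumes <cpuid.h> and compiling with -mcx16 so the builtin inlines to lock cmpxchg16b; the helper names are illustrative):

#include <cpuid.h>
#include <stdbool.h>

static bool have_cx16(void) {
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return false;
        return (ecx & bit_CMPXCHG16B) != 0;   // the "CX16" feature bit
}

// 16-byte atomic load built on LOCK CMPXCHG16B: comparing against 0 and
// conditionally storing 0 leaves memory unchanged either way, but returns
// the current 16-byte contents atomically. Note that even this read needs
// exclusive ownership of the cache line, unlike a plain load.
static __int128 atomic_load16(__int128 *p) {
        return __sync_val_compare_and_swap(p, (__int128)0, (__int128)0);
}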

EDIT: Test program results

(Test program modified to increase #iterations by a factor of 10)

On a Xeon X3450 (x86-64):

0000   999998139       1572
0001           0          0
0010           0          0
0011           0          0
0100           0          0
0101           0          0
0110           0          0
0111           0          0
1000           0          0
1001           0          0
1010           0          0
1011           0          0
1100           0          0
1101           0          0
1110           0          0
1111        1861  999998428

On a Xeon 5150 (32-bit):

0000   999243100     283087
0001           0          0
0010           0          0
0011           0          0
0100           0          0
0101           0          0
0110           0          0
0111           0          0
1000           0          0
1001           0          0
1010           0          0
1011           0          0
1100           0          0
1101           0          0
1110           0          0
1111      756900  999716913

On an Opteron 2435 (x86-64):

0000   999995893       1901
0001           0          0
0010           0          0
0011           0          0
0100           0          0
0101           0          0
0110           0          0
0111           0          0
1000           0          0
1001           0          0
1010           0          0
1011           0          0
1100           0          0
1101           0          0
1110           0          0
1111        4107  999998099

Note that the Intel Xeon X3450 and Xeon 5150 don't support AVX. The Opteron 2435 is an AMD processor (K10 "Istanbul") that also does not support AVX.

Does this mean that Intel and/or AMD guarantee that 16 byte memory accesses are atomic on these machines? IMHO, it does not. It's not in the documentation as guaranteed architectural behavior, and thus one cannot know if on these particular processors 16 byte memory accesses really are atomic or whether the test program merely fails to trigger them for one reason or another. And thus relying on it is dangerous.

EDIT 2: How to make the test program fail

Ha! I managed to make the test program fail. On the same Opteron 2435 as above, with the same binary, but now running it via the "numactl" tool specifying that each thread runs on a separate socket, I got:

0000   999998634       5990
0001           0          0
0010           0          0
0011           0          0
0100           0          0
0101           0          0
0110           0          0
0111           0          0
1000           0          0
1001           0          0
1010           0          0
1011           0          0
1100           0          1  Not a single memory access!
1101           0          0
1110           0          0
1111        1366  999994009

So what does this imply? Well, the Opteron 2435 may, or may not, guarantee that 16-byte memory accesses are atomic for intra-socket accesses, but at least the cache coherency protocol running on the HyperTransport interconnect between the two sockets does not provide such a guarantee.

EDIT 3: ASM for the thread functions, on request of "GJ."

Here's the generated asm for the thread functions for the GCC 4.4 x86-64 version used on the Opteron 2435 system:


.globl thread2
        .type   thread2, @function
thread2:
.LFB537:
        .cfi_startproc
        movdqa  .LC3(%rip), %xmm1
        xorl    %eax, %eax
        .p2align 5,,24
        .p2align 3
.L11:
        movaps  x(%rip), %xmm0
        incl    %eax
        movaps  %xmm1, x(%rip)
        movmskps        %xmm0, %edx
        movslq  %edx, %rdx
        incl    n2(,%rdx,4)
        cmpl    $1000000000, %eax
        jne     .L11
        xorl    %eax, %eax
        ret
        .cfi_endproc
.LFE537:
        .size   thread2, .-thread2
        .p2align 5,,31
.globl thread1
        .type   thread1, @function
thread1:
.LFB536:
        .cfi_startproc
        pxor    %xmm1, %xmm1
        xorl    %eax, %eax
        .p2align 5,,24
        .p2align 3
.L15:
        movaps  x(%rip), %xmm0
        incl    %eax
        movaps  %xmm1, x(%rip)
        movmskps        %xmm0, %edx
        movslq  %edx, %rdx
        incl    n1(,%rdx,4)
        cmpl    $1000000000, %eax
        jne     .L15
        xorl    %eax, %eax
        ret
        .cfi_endproc

and for completeness, .LC3 which is the static data containing the (-1, -1, -1, -1) vector used by thread2:


.LC3:
        .long   -1
        .long   -1
        .long   -1
        .long   -1
        .ident  "GCC: (GNU) 4.4.4 20100726 (Red Hat 4.4.4-13)"
        .section        .note.GNU-stack,"",@progbits

Also note that this is AT&T ASM syntax, not the Intel syntax Windows programmers might be more familiar with. Finally, this is with -march=native, which makes GCC prefer MOVAPS; but it doesn't matter: if I use -march=core2 it will use MOVDQA for storing to x, and I can still reproduce the failures.

池木 2024-12-15 03:50:27


Update: in 2022, Intel retroactively documented that the AVX feature bit implies that aligned 128-bit loads/stores are atomic, at least for Intel CPUs. AMD could document the same thing since in practice their CPUs with AVX support have I think avoided tearing on 8-byte boundaries. See @whatishappened's answer, and janneb's updated answer.

Pentium and Celeron versions of those CPUs (which have AVX disabled, so the feature bit is clear) will in practice also have the same atomicity, but there is no documented way for software to detect it. The same presumably holds for Core 2 and Nehalem, and probably some low-power Silvermont-family chips, which didn't have AVX until the Alder Lake E-cores.

So finally we can have cheapish atomic __int128 loads/stores on AVX CPUs in a well-documented way. (So C++ std::atomic<__int128> is_lock_free() could return true on some machines, but not is_always_lock_free as a compile-time constant unless the arch options make a binary that requires AVX. GCC previously used lock cmpxchg16b to implement load/store, but changed in GCC7 IIRC to not advertise that as "lock free" since it didn't have the read-side scaling you'd expect with proper support.)
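
For a quick check of what a given toolchain/CPU combination reports, a sketch using the GCC/Clang __atomic builtins (may need linking with -latomic):

#include <stdio.h>

int main(void) {
        __int128 x = 0;
        // 16 == sizeof(__int128); passing the address lets the runtime
        // take the actual alignment into account too.
        printf("16-byte atomics lock-free: %d\n",
               __atomic_is_lock_free(sizeof x, &x));
        return 0;
}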


Old partly-updated answer below

Erik Rigtorp has done some experimental testing on recent Intel and AMD CPUs to look for tearing. Results at https://rigtorp.se/isatomic/. Keep in mind there's no documentation or guarantee about this behaviour (beyond 128-bit or on non-AVX CPUs), and IDK if it's possible for a custom many-socket machine using such CPUs to have less atomicity than the machines he tested on. But on current x86 CPUs (not K10), SIMD atomicity for aligned loads/stores simply scales with the width of the data path between L1d cache and the load/store execution units.



The x86 ISA only guarantees atomicity for things up to 8B, so implementations are free to implement SSE / AVX support the way Pentium III / Pentium M / Core Duo does: internally, data is handled in 64-bit halves. A 128-bit store is done as two 64-bit stores. The data path to/from cache is only 64b wide in the Yonah microarchitecture (Core Duo). (Source: Agner Fog's microarch doc.)

More recent implementations do have wider data paths internally, and handle 128b instructions as a single op. Core 2 Duo (conroe/merom) was the first Intel P6-descended microarch with 128b data paths. (IDK about P4, but fortunately it's old enough to be totally irrelevant.)

This is why the OP finds that 128b ops are not atomic on Intel Core Duo (Yonah), but other posters find that they are atomic on later Intel designs, starting with Core 2 (Merom).

The diagrams on this Realworldtech writeup about Merom vs. Yonah show the 128bit path between ALU and L1 data-cache in Merom (and P4), while the low-power Yonah has a 64bit data path. The data path between L1 and L2 cache is 256b in all 3 designs.

The next jump in data path width came with Intel's Haswell, featuring 256b (32B) AVX/AVX2 loads/stores, and a 64Byte path between L1 and L2 cache. I expect that 256b loads/stores are atomic in Haswell, Broadwell, and Skylake, but I don't have one to test.

Skylake-AVX512 has 512-bit data paths, so they're also naturally atomic at least in reading/writing L1d cache. The ring bus (client chips) transfers in 32-byte chunks, but Intel guarantees no tearing between 32B halves, since they guarantee atomicity for 8-byte load/store at any misalignment as long as it doesn't cross a cache-line boundary.

Zen 4 handles 512-bit ops as two halves, so probably not 512-bit atomicity.


As janneb points out in his excellent experimental answer, the cache-coherency protocol between sockets in a multi-core system might be narrower than what you get within a shared-last-level-cache CPU. There is no architectural requirement on atomicity for wide loads/stores, so designers are free to make them atomic within a socket but non-atomic across sockets if that's convenient. IDK how wide the inter-socket logical data path is for AMD's Bulldozer-family, or for Intel. (I say "logical", because even if the data is transferred in smaller chunks, it might not modify a cache line until it's fully received.)


Finding similar articles about AMD CPUs should allow drawing reasonable conclusions about whether 128b ops are atomic or not. Just checking instruction tables is some help:

K8 decodes movaps reg, [mem] to 2 m-ops, while K10 and bulldozer-family decode it to 1 m-op. AMD's low-power bobcat decodes it to 2 ops, while jaguar decodes 128b movaps to 1 m-op. (It supports AVX1 similar to bulldozer-family CPUs: 256b insns (even ALU ops) are split into two 128b ops. Intel SnB only splits 256b loads/stores, while having full-width ALUs.)

janneb's Opteron 2435 is a 6-core Istanbul CPU, which is part of the K10 family, so this single-m-op -> atomic conclusion appears accurate within a single socket.

Intel Silvermont does 128b loads/stores with a single uop, and a throughput of one per clock. This is the same as for integer loads/stores, so it's quite probably atomic.

不顾 2024-12-15 03:50:27


The "AMD Architecture Programmer's Manual Volume 1: Application Programming" says in section 3.9.1: "CMPXCHG16B can be used to perform 16-byte atomic accesses in 64-bit mode (with certain alignment restrictions)."

However, there is no such comment about SSE instructions. In fact, there is a comment in 4.8.3 that the LOCK prefix "causes an invalid-opcode exception when used with 128-bit media instructions". It therefore seems pretty conclusive to me that the AMD processors do NOT guarantee atomic 128-bit accesses for SSE instructions, and the only way to do an atomic 128-bit access is to use CMPXCHG16B.

The "Intel 64 and IA-32 Architectures Software Developer’s Manual Volume 3A: System Programming Guide, Part 1" says in 8.1.1 "An x87 instruction or an SSE instructions that accesses data larger than a quadword may be implemented using multiple memory accesses." This is pretty conclusive that 128-bit SSE instructions are not guaranteed atomic by the ISA. Volume 2A of the Intel docs says of CMPXCHG16B: "This instruction can be used with a LOCK prefix to allow the instruction to be executed atomically."

Further, CPU manufacturers haven't published written guarantees of atomic 128b SSE operations for specific CPU models where that is the case.

半岛未凉 2024-12-15 03:50:27


There is actually a warning in the Intel Architecture Manual Vol 3A. Section 8.1.1 (May 2011), under the section of guaranteed atomic operations:

An x87 instruction or an SSE instructions that accesses data larger
than a quadword may be implemented using multiple memory accesses. If
such an instruction stores to memory, some of the accesses may
complete (writing to memory) while another causes the operation to
fault for architectural reasons (e.g. due to a page-table entry that is
marked “not present”). In this case, the effects of the completed
accesses may be visible to software even though the overall
instruction caused a fault. If TLB invalidation has been delayed (see
Section 4.10.4.4), such page faults may occur even if all accesses are
to the same page.

thus SSE instructions are not guaranteed to be atomic, even if the underlying architecture does use a single memory access (this is one reason why memory fencing was introduced).

Combine that with this statement from the Intel Optimization Manual, Section 13.3 (April 2011)

AVX and FMA instructions do not introduce any new guaranteed atomic
memory operations.

and the fact that none of the SIMD load or store operations guarantee atomicity, and we can conclude that Intel doesn't support any form of atomic SIMD (yet).

As an extra bit, if the memory is split along cache lines or page boundaries (when using things like movdqu which permit unaligned access), the following processors will not perform atomic accesses, regardless of alignment, but later processors will (again from the Intel Architecture Manual):

Intel Core 2 Duo, Intel® Atom™, Intel Core Duo, Pentium M, Pentium 4,
Intel Xeon, P6 family, Pentium, and Intel486 processors.

放手` 2024-12-15 03:50:27


It looks like AMD will also specify in the next revision of their manual that aligned 16b loads and stores are atomic on their x86 processors which support AVX. (Source)

Apologies for late response!

We would update the AMD APM manuals in the next revision.

For all AMD architectures,

Processors that support AVX extend the atomicity for cacheable, naturally-aligned single loads or stores from a quadword to a double quadword.

which means all 128b instructions, even the *MOVDQU instructions, are atomic if they end up being naturally aligned.

Can we extend this patch to AMD processors as well. If not, I will plan to submit the patch for stage-1!

With this, the patch that makes libatomic use vmovdqa in its implementation of __atomic_load_16 and __atomic_store_16, not only on Intel processors with AVX but also on AMD processors with AVX, has landed on the master branch.
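
In source terms the change amounts to something like the following sketch (illustrative only, not the actual libatomic code; compile with -mavx so the intrinsics emit the VEX-encoded vmovdqa, and only call these after checking the AVX feature bit):

#include <emmintrin.h>

// With -mavx these compile to vmovdqa, which the quoted wording makes a
// genuine 16-byte atomic load/store for aligned addresses.
static inline __m128i load16_atomic(const __m128i *p) {
        return _mm_load_si128(p);
}

static inline void store16_atomic(__m128i *p, __m128i v) {
        _mm_store_si128(p, v);
}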

就像说晚安 2024-12-15 03:50:27


EDIT:
In the last two days I have made several tests on my three PCs and I didn't reproduce any memory error, so I can't say anything more precise. Maybe this memory error is also dependent on the OS.

EDIT:
I'm programming in Delphi, not in C, but I should understand C. So I have translated the code; here you have the thread procedures, where the main part is written in assembler:

procedure TThread1.Execute;
var
  n             :cardinal;
const
  ConstAll0     :array[0..3] of integer =(0,0,0,0);
begin
  for n := 0 to 100000000 do
    asm
      movdqa    xmm0, dqword [x]
      movmskps  eax, xmm0
      inc       dword ptr[n1 + eax *4]
      movdqu    xmm0, dqword [ConstAll0]
      movdqa    dqword [x], xmm0
    end;
end;

{ TThread2 }

procedure TThread2.Execute;
var
  n             :cardinal;
const
  ConstAll1     :array[0..3] of integer =(-1,-1,-1,-1);
begin
  for n := 0 to 100000000 do
    asm
      movdqa    xmm0, dqword [x]
      movmskps  eax, xmm0
      inc       dword ptr[n2 + eax *4]
      movdqu    xmm0, dqword [ConstAll1]
      movdqa    dqword [x], xmm0
    end;
end;

Result: no mistake on my quad core PC and no mistake on my dual core PC as expected!

  1. PC with Intel Pentium4 CPU
  2. PC with Intel Core2 Quad CPU Q6600
  3. PC with Intel Core2 Duo CPU P8400

Can you show how the debugger sees your thread procedure code? Please...

一瞬间的火花 2024-12-15 03:50:27


A lot of answers have been posted so far, and hence a lot of information is already available (and, as a side effect, a lot of confusion too). I would like to cite facts from the Intel manual regarding hardware-guaranteed atomic operations...

In Intel's latest processors of the Nehalem and Sandy Bridge families, reading or writing a quadword aligned to a 64-bit boundary is guaranteed to be atomic.

Even unaligned 2-, 4- or 8-byte reads or writes are guaranteed to be atomic, provided they are to cached memory and fit within a cache line.
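
That "fits in a cache line" condition is easy to express as code; a tiny sketch (assuming the common 64-byte line size, which could instead be queried at run time):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// True if an n-byte access at p stays inside one 64-byte cache line,
// which is the condition the manual attaches to the unaligned guarantee.
static bool fits_in_one_cache_line(const void *p, size_t n) {
        uintptr_t a = (uintptr_t)p;
        return (a / 64) == ((a + n - 1) / 64);
}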

Having said that, the test posted in this question passes on a Sandy Bridge-based Intel i5 processor.
