卷积函数延迟瓶颈

发布于 2025-01-12 15:18:46 字数 15769 浏览 4 评论 0原文

我已经用 C 语言实现了一个卷积神经网络，并一直在研究它的哪些部分具有最长的延迟。

根据我的研究，CNN 所需的大量矩阵乘法使得它们在 CPU 甚至 GPU 上运行效率非常低。然而，当我实际分析我的代码（在未优化的构建上）时，我发现除乘法本身之外的其他东西是实现的瓶颈。

开启优化后（-O3 -march=native -ffast-math，gcc交叉编译器），Gprof结果如下：

，卷积2D函数的运行时间最长，其次是批量归一化函数和深度卷积函数。

有问题的卷积函数如下所示：

void convolution2D(int isize,   // width/height of input
        int osize,              // width/height of output
        int ksize,              // width/height of kernel
        int stride,             // shift between input pixels, between consecutive outputs
        int pad,                // offset between (0,0) pixels between input and output
        int idepth, int odepth, // number of input and output channels
        float idata[isize][isize][idepth],
        float odata[osize][osize][odepth],
        float kdata[odepth][ksize][ksize][idepth])
{
    // iterate over the output
    for (int oy = 0; oy < osize; ++oy) {
    for (int ox = 0; ox < osize; ++ox) {
    for (int od = 0; od < odepth; ++od) {
        odata[oy][ox][od] = 0;  // When you iterate multiple times without closing the program, this number would stack up to infinity, so we have to zero it out every time.
        for (int ky = 0; ky < ksize; ++ky) {
        for (int kx = 0; kx < ksize; ++kx) {
            // map position in output and kernel to the input
            int iy = stride * oy + ky - pad;
            int ix = stride * ox + kx - pad;
            // use only valid inputs
            if (iy >= 0 && iy < isize && ix >= 0 && ix < isize)
                for (int id = 0; id < idepth; ++id)
                    odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];
        }}
    }}}

}

这是基于我之前的问题，大部分处理时间应该落在卷积本身上：odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];。

使用 objdump -drwC -Mintel 来查看汇编代码会返回以下内容：


0000000000007880 <convolution2D>:
    7880:   f3 0f 1e fa             endbr64 
    7884:   55                      push   rbp
    7885:   48 89 e5                mov    rbp,rsp
    7888:   41 57                   push   r15
    788a:   41 56                   push   r14
    788c:   41 55                   push   r13
    788e:   41 54                   push   r12
    7890:   53                      push   rbx
    7891:   48 81 ec b0 00 00 00    sub    rsp,0xb0
    7898:   ff 15 4a a7 00 00       call   QWORD PTR [rip+0xa74a]        # 11fe8 <mcount@GLIBC_2.2.5>
    789e:   89 d3                   mov    ebx,edx
    78a0:   89 55 a8                mov    DWORD PTR [rbp-0x58],edx
    78a3:   89 8d 74 ff ff ff       mov    DWORD PTR [rbp-0x8c],ecx
    78a9:   49 63 d1                movsxd rdx,r9d
    78ac:   48 63 cf                movsxd rcx,edi
    78af:   41 89 f2                mov    r10d,esi
    78b2:   89 b5 38 ff ff ff       mov    DWORD PTR [rbp-0xc8],esi
    78b8:   49 63 c0                movsxd rax,r8d
    78bb:   48 0f af ca             imul   rcx,rdx
    78bf:   48 63 75 10             movsxd rsi,DWORD PTR [rbp+0x10]
    78c3:   49 89 d6                mov    r14,rdx
    78c6:   4c 8d 24 95 00 00 00 00     lea    r12,[rdx*4+0x0]
    78ce:   41 89 fd                mov    r13d,edi
    78d1:   49 89 cb                mov    r11,rcx
    78d4:   48 89 8d 60 ff ff ff    mov    QWORD PTR [rbp-0xa0],rcx
    78db:   49 63 ca                movsxd rcx,r10d
    78de:   4c 8d 0c b5 00 00 00 00     lea    r9,[rsi*4+0x0]
    78e6:   49 89 f0                mov    r8,rsi
    78e9:   48 0f af f1             imul   rsi,rcx
    78ed:   48 63 cb                movsxd rcx,ebx
    78f0:   4c 89 8d 48 ff ff ff    mov    QWORD PTR [rbp-0xb8],r9
    78f7:   48 0f af d1             imul   rdx,rcx
    78fb:   48 8d 3c 95 00 00 00 00     lea    rdi,[rdx*4+0x0]
    7903:   45 85 d2                test   r10d,r10d
    7906:   0f 8e 73 02 00 00       jle    7b7f <convolution2D+0x2ff>
    790c:   48 c1 ef 02             shr    rdi,0x2
    7910:   49 c1 e9 02             shr    r9,0x2
    7914:   48 89 7d c8             mov    QWORD PTR [rbp-0x38],rdi
    7918:   4c 89 e7                mov    rdi,r12
    791b:   4c 89 8d 58 ff ff ff    mov    QWORD PTR [rbp-0xa8],r9
    7922:   48 c1 ef 02             shr    rdi,0x2
    7926:   48 89 bd 50 ff ff ff    mov    QWORD PTR [rbp-0xb0],rdi
    792d:   45 85 c0                test   r8d,r8d
    7930:   0f 8e 49 02 00 00       jle    7b7f <convolution2D+0x2ff>
    7936:   48 c1 e6 02             shl    rsi,0x2
    793a:   48 0f af d1             imul   rdx,rcx
    793e:   29 c3                   sub    ebx,eax
    7940:   89 c7                   mov    edi,eax
    7942:   48 89 b5 30 ff ff ff    mov    QWORD PTR [rbp-0xd0],rsi
    7949:   48 8b 75 20             mov    rsi,QWORD PTR [rbp+0x20]
    794d:   48 89 85 68 ff ff ff    mov    QWORD PTR [rbp-0x98],rax
    7954:   f7 df                   neg    edi
    7956:   45 8d 7e ff             lea    r15d,[r14-0x1]
    795a:   89 9d 70 ff ff ff       mov    DWORD PTR [rbp-0x90],ebx
    7960:   89 bd 3c ff ff ff       mov    DWORD PTR [rbp-0xc4],edi
    7966:   48 8d 0c 95 00 00 00 00     lea    rcx,[rdx*4+0x0]
    796e:   89 7d ac                mov    DWORD PTR [rbp-0x54],edi
    7971:   89 5d d4                mov    DWORD PTR [rbp-0x2c],ebx
    7974:   48 89 4d 98             mov    QWORD PTR [rbp-0x68],rcx
    7978:   4a 8d 0c 9d 00 00 00 00     lea    rcx,[r11*4+0x0]
    7980:   c7 45 80 00 00 00 00    mov    DWORD PTR [rbp-0x80],0x0
    7987:   48 89 75 88             mov    QWORD PTR [rbp-0x78],rsi
    798b:   41 8d 70 ff             lea    esi,[r8-0x1]
    798f:   48 89 4d c0             mov    QWORD PTR [rbp-0x40],rcx
    7993:   48 8d 04 b5 04 00 00 00     lea    rax,[rsi*4+0x4]
    799b:   c7 45 90 00 00 00 00    mov    DWORD PTR [rbp-0x70],0x0
    79a2:   48 89 85 28 ff ff ff    mov    QWORD PTR [rbp-0xd8],rax
    79a9:   44 89 f0                mov    eax,r14d
    79ac:   45 89 ee                mov    r14d,r13d
    79af:   41 89 c5                mov    r13d,eax
    79b2:   48 8b 85 28 ff ff ff    mov    rax,QWORD PTR [rbp-0xd8]
    79b9:   48 03 45 88             add    rax,QWORD PTR [rbp-0x78]
    79bd:   48 c7 85 78 ff ff ff 00 00 00 00    mov    QWORD PTR [rbp-0x88],0x0
    79c8:   c7 45 84 00 00 00 00    mov    DWORD PTR [rbp-0x7c],0x0
    79cf:   c7 45 94 00 00 00 00    mov    DWORD PTR [rbp-0x6c],0x0
    79d6:   44 8b 95 70 ff ff ff    mov    r10d,DWORD PTR [rbp-0x90]
    79dd:   48 89 45 b0             mov    QWORD PTR [rbp-0x50],rax
    79e1:   48 63 45 80             movsxd rax,DWORD PTR [rbp-0x80]
    79e5:   48 2b 85 68 ff ff ff    sub    rax,QWORD PTR [rbp-0x98]
    79ec:   48 0f af 85 60 ff ff ff     imul   rax,QWORD PTR [rbp-0xa0]
    79f4:   48 89 85 40 ff ff ff    mov    QWORD PTR [rbp-0xc0],rax
    79fb:   8b 85 3c ff ff ff       mov    eax,DWORD PTR [rbp-0xc4]
    7a01:   89 45 d0                mov    DWORD PTR [rbp-0x30],eax
    7a04:   48 8b 45 88             mov    rax,QWORD PTR [rbp-0x78]
    7a08:   48 8b 9d 78 ff ff ff    mov    rbx,QWORD PTR [rbp-0x88]
    7a0f:   4c 8d 04 98             lea    r8,[rax+rbx*4]
    7a13:   48 8b 45 28             mov    rax,QWORD PTR [rbp+0x28]
    7a17:   48 8b 5d 18             mov    rbx,QWORD PTR [rbp+0x18]
    7a1b:   48 89 45 b8             mov    QWORD PTR [rbp-0x48],rax
    7a1f:   48 63 45 84             movsxd rax,DWORD PTR [rbp-0x7c]
    7a23:   48 2b 85 68 ff ff ff    sub    rax,QWORD PTR [rbp-0x98]
    7a2a:   48 0f af 85 50 ff ff ff     imul   rax,QWORD PTR [rbp-0xb0]
    7a32:   48 03 85 40 ff ff ff    add    rax,QWORD PTR [rbp-0xc0]
    7a39:   48 8d 04 83             lea    rax,[rbx+rax*4]
    7a3d:   48 89 45 a0             mov    QWORD PTR [rbp-0x60],rax
    7a41:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7a4c:   0f 1f 40 00             nop    DWORD PTR [rax+0x0]
    7a50:   8b 45 a8                mov    eax,DWORD PTR [rbp-0x58]
    7a53:   41 c7 00 00 00 00 00    mov    DWORD PTR [r8],0x0
    7a5a:   45 31 db                xor    r11d,r11d
    7a5d:   48 8b 5d a0             mov    rbx,QWORD PTR [rbp-0x60]
    7a61:   44 8b 4d ac             mov    r9d,DWORD PTR [rbp-0x54]
    7a65:   85 c0                   test   eax,eax
    7a67:   0f 8e 98 00 00 00       jle    7b05 <convolution2D+0x285>
    7a6d:   0f 1f 00                nop    DWORD PTR [rax]
    7a70:   45 85 c9                test   r9d,r9d
    7a73:   78 7b                   js     7af0 <convolution2D+0x270>
    7a75:   45 39 ce                cmp    r14d,r9d
    7a78:   7e 76                   jle    7af0 <convolution2D+0x270>
    7a7a:   48 8b 45 b8             mov    rax,QWORD PTR [rbp-0x48]
    7a7e:   8b 55 d0                mov    edx,DWORD PTR [rbp-0x30]
    7a81:   48 89 de                mov    rsi,rbx
    7a84:   4a 8d 3c 98             lea    rdi,[rax+r11*4]
    7a88:   eb 13                   jmp    7a9d <convolution2D+0x21d>
    7a8a:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]
    7a90:   ff c2                   inc    edx
    7a92:   4c 01 e7                add    rdi,r12
    7a95:   4c 01 e6                add    rsi,r12
    7a98:   44 39 d2                cmp    edx,r10d
    7a9b:   74 53                   je     7af0 <convolution2D+0x270>
    7a9d:   85 d2                   test   edx,edx
    7a9f:   78 ef                   js     7a90 <convolution2D+0x210>
    7aa1:   41 39 d6                cmp    r14d,edx
    7aa4:   7e ea                   jle    7a90 <convolution2D+0x210>
    7aa6:   45 85 ed                test   r13d,r13d
    7aa9:   7e e5                   jle    7a90 <convolution2D+0x210>
    7aab:   c4 c1 7a 10 08          vmovss xmm1,DWORD PTR [r8]
    7ab0:   31 c0                   xor    eax,eax
    7ab2:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7abd:   0f 1f 00                nop    DWORD PTR [rax]
    7ac0:   c5 fa 10 04 87          vmovss xmm0,DWORD PTR [rdi+rax*4]
    7ac5:   48 89 c1                mov    rcx,rax
    7ac8:   c5 fa 59 04 86          vmulss xmm0,xmm0,DWORD PTR [rsi+rax*4]
    7acd:   48 ff c0                inc    rax
    7ad0:   c5 f2 58 c8             vaddss xmm1,xmm1,xmm0
    7ad4:   c4 c1 7a 11 08          vmovss DWORD PTR [r8],xmm1
    7ad9:   49 39 cf                cmp    r15,rcx
    7adc:   75 e2                   jne    7ac0 <convolution2D+0x240>
    7ade:   ff c2                   inc    edx
    7ae0:   4c 01 e7                add    rdi,r12
    7ae3:   4c 01 e6                add    rsi,r12
    7ae6:   44 39 d2                cmp    edx,r10d
    7ae9:   75 b2                   jne    7a9d <convolution2D+0x21d>
    7aeb:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
    7af0:   4c 03 5d c8             add    r11,QWORD PTR [rbp-0x38]
    7af4:   48 03 5d c0             add    rbx,QWORD PTR [rbp-0x40]
    7af8:   41 ff c1                inc    r9d
    7afb:   44 3b 4d d4             cmp    r9d,DWORD PTR [rbp-0x2c]
    7aff:   0f 85 6b ff ff ff       jne    7a70 <convolution2D+0x1f0>
    7b05:   48 8b 5d 98             mov    rbx,QWORD PTR [rbp-0x68]
    7b09:   49 83 c0 04             add    r8,0x4
    7b0d:   48 01 5d b8             add    QWORD PTR [rbp-0x48],rbx
    7b11:   4c 3b 45 b0             cmp    r8,QWORD PTR [rbp-0x50]
    7b15:   0f 85 35 ff ff ff       jne    7a50 <convolution2D+0x1d0>
    7b1b:   8b 9d 74 ff ff ff       mov    ebx,DWORD PTR [rbp-0x8c]
    7b21:   8b 45 94                mov    eax,DWORD PTR [rbp-0x6c]
    7b24:   48 8b 8d 48 ff ff ff    mov    rcx,QWORD PTR [rbp-0xb8]
    7b2b:   01 5d d0                add    DWORD PTR [rbp-0x30],ebx
    7b2e:   48 01 4d b0             add    QWORD PTR [rbp-0x50],rcx
    7b32:   01 5d 84                add    DWORD PTR [rbp-0x7c],ebx
    7b35:   48 8b 8d 58 ff ff ff    mov    rcx,QWORD PTR [rbp-0xa8]
    7b3c:   41 01 da                add    r10d,ebx
    7b3f:   48 01 8d 78 ff ff ff    add    QWORD PTR [rbp-0x88],rcx
    7b46:   ff c0                   inc    eax
    7b48:   39 85 38 ff ff ff       cmp    DWORD PTR [rbp-0xc8],eax
    7b4e:   74 08                   je     7b58 <convolution2D+0x2d8>
    7b50:   89 45 94                mov    DWORD PTR [rbp-0x6c],eax
    7b53:   e9 ac fe ff ff          jmp    7a04 <convolution2D+0x184>
    7b58:   8b 4d 90                mov    ecx,DWORD PTR [rbp-0x70]
    7b5b:   48 8b b5 30 ff ff ff    mov    rsi,QWORD PTR [rbp-0xd0]
    7b62:   01 5d d4                add    DWORD PTR [rbp-0x2c],ebx
    7b65:   01 5d ac                add    DWORD PTR [rbp-0x54],ebx
    7b68:   01 5d 80                add    DWORD PTR [rbp-0x80],ebx
    7b6b:   48 01 75 88             add    QWORD PTR [rbp-0x78],rsi
    7b6f:   8d 41 01                lea    eax,[rcx+0x1]
    7b72:   39 4d 94                cmp    DWORD PTR [rbp-0x6c],ecx
    7b75:   74 08                   je     7b7f <convolution2D+0x2ff>
    7b77:   89 45 90                mov    DWORD PTR [rbp-0x70],eax
    7b7a:   e9 33 fe ff ff          jmp    79b2 <convolution2D+0x132>
    7b7f:   48 81 c4 b0 00 00 00    add    rsp,0xb0
    7b86:   5b                      pop    rbx
    7b87:   41 5c                   pop    r12
    7b89:   41 5d                   pop    r13
    7b8b:   41 5e                   pop    r14
    7b8d:   41 5f                   pop    r15
    7b8f:   5d                      pop    rbp
    7b90:   c3                      ret    
    7b91:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7b9c:   0f 1f 40 00             nop    DWORD PTR [rax+0x0]

作为参考，我使用的是采用 Zen2 架构的 AMD Ryzen 7 CPU。这是其指令列表（第101页）。

我怀疑这里的数据指向内存问题，而不仅仅是乘法是造成瓶颈的原因。

问题：

如何改进此代码，使其不会导致内存瓶颈？

我猜这实际上是我的代码特有的问题，也许与我正在使用的多维数组有关。如果我为每个变量使用一个大的一维数组，延迟会减少吗？

相关信息：

我有两种方法声明传递给该函数的变量。第一个是作为全局变量（通常在结构中），第二个是作为动态分配：

float (*arr)[x][y] = calloc(z, sizeof *arr);

也许我声明这些矩阵的顺序不适合缓存，但我不确定如何重新排序。

前一个函数的步幅值始终为 1 或 2，通常为 1。

以下是 valgrind --tool=cachegrind 的输出：

==430300== Cachegrind, a cache and branch-prediction profiler
==430300== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==430300== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==430300== Command: ./EmbeddedNet test 1
==430300== Parent PID: 170008
==430300== 
--430300-- warning: L3 cache found, using its data for the LL simulation.
==430300== 
==430300== I   refs:      6,369,594,192
==430300== I1  misses:            4,271
==430300== LLi misses:            2,442
==430300== I1  miss rate:          0.00%
==430300== LLi miss rate:          0.00%
==430300== 
==430300== D   refs:      2,064,233,110  (1,359,003,131 rd   + 705,229,979 wr)
==430300== D1  misses:       34,476,969  (   19,010,839 rd   +  15,466,130 wr)
==430300== LLd misses:        5,311,277  (    1,603,955 rd   +   3,707,322 wr)
==430300== D1  miss rate:           1.7% (          1.4%     +         2.2%  )
==430300== LLd miss rate:           0.3% (          0.1%     +         0.5%  )
==430300== 
==430300== LL refs:          34,481,240  (   19,015,110 rd   +  15,466,130 wr)
==430300== LL misses:         5,313,719  (    1,606,397 rd   +   3,707,322 wr)
==430300== LL miss rate:            0.1% (          0.0%     +         0.5%  )

原文

I have implemented a Convolutional Neural Network in C and have been studying what parts of it have the longest latency.

Based on my research, the massive amounts of matricial multiplication required by CNNs makes running them on CPUs and even GPUs very inefficient. However, when I actually profiled my code (on an unoptimized build) I found out that something other than the multiplication itself was the bottleneck of the implementation.

After turning on optimization (-O3 -march=native -ffast-math, gcc cross compiler), the Gprof result was the following:

Clearly, the convolution2D function takes the largest amount of time to run, followed by the batch normalization and depthwise convolution functions.

The convolution function in question looks like this:

void convolution2D(int isize,   // width/height of input
        int osize,              // width/height of output
        int ksize,              // width/height of kernel
        int stride,             // shift between input pixels, between consecutive outputs
        int pad,                // offset between (0,0) pixels between input and output
        int idepth, int odepth, // number of input and output channels
        float idata[isize][isize][idepth],
        float odata[osize][osize][odepth],
        float kdata[odepth][ksize][ksize][idepth])
{
    // iterate over the output
    for (int oy = 0; oy < osize; ++oy) {
    for (int ox = 0; ox < osize; ++ox) {
    for (int od = 0; od < odepth; ++od) {
        odata[oy][ox][od] = 0;  // When you iterate multiple times without closing the program, this number would stack up to infinity, so we have to zero it out every time.
        for (int ky = 0; ky < ksize; ++ky) {
        for (int kx = 0; kx < ksize; ++kx) {
            // map position in output and kernel to the input
            int iy = stride * oy + ky - pad;
            int ix = stride * ox + kx - pad;
            // use only valid inputs
            if (iy >= 0 && iy < isize && ix >= 0 && ix < isize)
                for (int id = 0; id < idepth; ++id)
                    odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];
        }}
    }}}

}

This is a design based on my previous question and most of the processing time should fall on the convolution itself: odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];.

Using objdump -drwC -Mintel to take a look at the assembly code returns me the following:


0000000000007880 <convolution2D>:
    7880:   f3 0f 1e fa             endbr64 
    7884:   55                      push   rbp
    7885:   48 89 e5                mov    rbp,rsp
    7888:   41 57                   push   r15
    788a:   41 56                   push   r14
    788c:   41 55                   push   r13
    788e:   41 54                   push   r12
    7890:   53                      push   rbx
    7891:   48 81 ec b0 00 00 00    sub    rsp,0xb0
    7898:   ff 15 4a a7 00 00       call   QWORD PTR [rip+0xa74a]        # 11fe8 <mcount@GLIBC_2.2.5>
    789e:   89 d3                   mov    ebx,edx
    78a0:   89 55 a8                mov    DWORD PTR [rbp-0x58],edx
    78a3:   89 8d 74 ff ff ff       mov    DWORD PTR [rbp-0x8c],ecx
    78a9:   49 63 d1                movsxd rdx,r9d
    78ac:   48 63 cf                movsxd rcx,edi
    78af:   41 89 f2                mov    r10d,esi
    78b2:   89 b5 38 ff ff ff       mov    DWORD PTR [rbp-0xc8],esi
    78b8:   49 63 c0                movsxd rax,r8d
    78bb:   48 0f af ca             imul   rcx,rdx
    78bf:   48 63 75 10             movsxd rsi,DWORD PTR [rbp+0x10]
    78c3:   49 89 d6                mov    r14,rdx
    78c6:   4c 8d 24 95 00 00 00 00     lea    r12,[rdx*4+0x0]
    78ce:   41 89 fd                mov    r13d,edi
    78d1:   49 89 cb                mov    r11,rcx
    78d4:   48 89 8d 60 ff ff ff    mov    QWORD PTR [rbp-0xa0],rcx
    78db:   49 63 ca                movsxd rcx,r10d
    78de:   4c 8d 0c b5 00 00 00 00     lea    r9,[rsi*4+0x0]
    78e6:   49 89 f0                mov    r8,rsi
    78e9:   48 0f af f1             imul   rsi,rcx
    78ed:   48 63 cb                movsxd rcx,ebx
    78f0:   4c 89 8d 48 ff ff ff    mov    QWORD PTR [rbp-0xb8],r9
    78f7:   48 0f af d1             imul   rdx,rcx
    78fb:   48 8d 3c 95 00 00 00 00     lea    rdi,[rdx*4+0x0]
    7903:   45 85 d2                test   r10d,r10d
    7906:   0f 8e 73 02 00 00       jle    7b7f <convolution2D+0x2ff>
    790c:   48 c1 ef 02             shr    rdi,0x2
    7910:   49 c1 e9 02             shr    r9,0x2
    7914:   48 89 7d c8             mov    QWORD PTR [rbp-0x38],rdi
    7918:   4c 89 e7                mov    rdi,r12
    791b:   4c 89 8d 58 ff ff ff    mov    QWORD PTR [rbp-0xa8],r9
    7922:   48 c1 ef 02             shr    rdi,0x2
    7926:   48 89 bd 50 ff ff ff    mov    QWORD PTR [rbp-0xb0],rdi
    792d:   45 85 c0                test   r8d,r8d
    7930:   0f 8e 49 02 00 00       jle    7b7f <convolution2D+0x2ff>
    7936:   48 c1 e6 02             shl    rsi,0x2
    793a:   48 0f af d1             imul   rdx,rcx
    793e:   29 c3                   sub    ebx,eax
    7940:   89 c7                   mov    edi,eax
    7942:   48 89 b5 30 ff ff ff    mov    QWORD PTR [rbp-0xd0],rsi
    7949:   48 8b 75 20             mov    rsi,QWORD PTR [rbp+0x20]
    794d:   48 89 85 68 ff ff ff    mov    QWORD PTR [rbp-0x98],rax
    7954:   f7 df                   neg    edi
    7956:   45 8d 7e ff             lea    r15d,[r14-0x1]
    795a:   89 9d 70 ff ff ff       mov    DWORD PTR [rbp-0x90],ebx
    7960:   89 bd 3c ff ff ff       mov    DWORD PTR [rbp-0xc4],edi
    7966:   48 8d 0c 95 00 00 00 00     lea    rcx,[rdx*4+0x0]
    796e:   89 7d ac                mov    DWORD PTR [rbp-0x54],edi
    7971:   89 5d d4                mov    DWORD PTR [rbp-0x2c],ebx
    7974:   48 89 4d 98             mov    QWORD PTR [rbp-0x68],rcx
    7978:   4a 8d 0c 9d 00 00 00 00     lea    rcx,[r11*4+0x0]
    7980:   c7 45 80 00 00 00 00    mov    DWORD PTR [rbp-0x80],0x0
    7987:   48 89 75 88             mov    QWORD PTR [rbp-0x78],rsi
    798b:   41 8d 70 ff             lea    esi,[r8-0x1]
    798f:   48 89 4d c0             mov    QWORD PTR [rbp-0x40],rcx
    7993:   48 8d 04 b5 04 00 00 00     lea    rax,[rsi*4+0x4]
    799b:   c7 45 90 00 00 00 00    mov    DWORD PTR [rbp-0x70],0x0
    79a2:   48 89 85 28 ff ff ff    mov    QWORD PTR [rbp-0xd8],rax
    79a9:   44 89 f0                mov    eax,r14d
    79ac:   45 89 ee                mov    r14d,r13d
    79af:   41 89 c5                mov    r13d,eax
    79b2:   48 8b 85 28 ff ff ff    mov    rax,QWORD PTR [rbp-0xd8]
    79b9:   48 03 45 88             add    rax,QWORD PTR [rbp-0x78]
    79bd:   48 c7 85 78 ff ff ff 00 00 00 00    mov    QWORD PTR [rbp-0x88],0x0
    79c8:   c7 45 84 00 00 00 00    mov    DWORD PTR [rbp-0x7c],0x0
    79cf:   c7 45 94 00 00 00 00    mov    DWORD PTR [rbp-0x6c],0x0
    79d6:   44 8b 95 70 ff ff ff    mov    r10d,DWORD PTR [rbp-0x90]
    79dd:   48 89 45 b0             mov    QWORD PTR [rbp-0x50],rax
    79e1:   48 63 45 80             movsxd rax,DWORD PTR [rbp-0x80]
    79e5:   48 2b 85 68 ff ff ff    sub    rax,QWORD PTR [rbp-0x98]
    79ec:   48 0f af 85 60 ff ff ff     imul   rax,QWORD PTR [rbp-0xa0]
    79f4:   48 89 85 40 ff ff ff    mov    QWORD PTR [rbp-0xc0],rax
    79fb:   8b 85 3c ff ff ff       mov    eax,DWORD PTR [rbp-0xc4]
    7a01:   89 45 d0                mov    DWORD PTR [rbp-0x30],eax
    7a04:   48 8b 45 88             mov    rax,QWORD PTR [rbp-0x78]
    7a08:   48 8b 9d 78 ff ff ff    mov    rbx,QWORD PTR [rbp-0x88]
    7a0f:   4c 8d 04 98             lea    r8,[rax+rbx*4]
    7a13:   48 8b 45 28             mov    rax,QWORD PTR [rbp+0x28]
    7a17:   48 8b 5d 18             mov    rbx,QWORD PTR [rbp+0x18]
    7a1b:   48 89 45 b8             mov    QWORD PTR [rbp-0x48],rax
    7a1f:   48 63 45 84             movsxd rax,DWORD PTR [rbp-0x7c]
    7a23:   48 2b 85 68 ff ff ff    sub    rax,QWORD PTR [rbp-0x98]
    7a2a:   48 0f af 85 50 ff ff ff     imul   rax,QWORD PTR [rbp-0xb0]
    7a32:   48 03 85 40 ff ff ff    add    rax,QWORD PTR [rbp-0xc0]
    7a39:   48 8d 04 83             lea    rax,[rbx+rax*4]
    7a3d:   48 89 45 a0             mov    QWORD PTR [rbp-0x60],rax
    7a41:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7a4c:   0f 1f 40 00             nop    DWORD PTR [rax+0x0]
    7a50:   8b 45 a8                mov    eax,DWORD PTR [rbp-0x58]
    7a53:   41 c7 00 00 00 00 00    mov    DWORD PTR [r8],0x0
    7a5a:   45 31 db                xor    r11d,r11d
    7a5d:   48 8b 5d a0             mov    rbx,QWORD PTR [rbp-0x60]
    7a61:   44 8b 4d ac             mov    r9d,DWORD PTR [rbp-0x54]
    7a65:   85 c0                   test   eax,eax
    7a67:   0f 8e 98 00 00 00       jle    7b05 <convolution2D+0x285>
    7a6d:   0f 1f 00                nop    DWORD PTR [rax]
    7a70:   45 85 c9                test   r9d,r9d
    7a73:   78 7b                   js     7af0 <convolution2D+0x270>
    7a75:   45 39 ce                cmp    r14d,r9d
    7a78:   7e 76                   jle    7af0 <convolution2D+0x270>
    7a7a:   48 8b 45 b8             mov    rax,QWORD PTR [rbp-0x48]
    7a7e:   8b 55 d0                mov    edx,DWORD PTR [rbp-0x30]
    7a81:   48 89 de                mov    rsi,rbx
    7a84:   4a 8d 3c 98             lea    rdi,[rax+r11*4]
    7a88:   eb 13                   jmp    7a9d <convolution2D+0x21d>
    7a8a:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]
    7a90:   ff c2                   inc    edx
    7a92:   4c 01 e7                add    rdi,r12
    7a95:   4c 01 e6                add    rsi,r12
    7a98:   44 39 d2                cmp    edx,r10d
    7a9b:   74 53                   je     7af0 <convolution2D+0x270>
    7a9d:   85 d2                   test   edx,edx
    7a9f:   78 ef                   js     7a90 <convolution2D+0x210>
    7aa1:   41 39 d6                cmp    r14d,edx
    7aa4:   7e ea                   jle    7a90 <convolution2D+0x210>
    7aa6:   45 85 ed                test   r13d,r13d
    7aa9:   7e e5                   jle    7a90 <convolution2D+0x210>
    7aab:   c4 c1 7a 10 08          vmovss xmm1,DWORD PTR [r8]
    7ab0:   31 c0                   xor    eax,eax
    7ab2:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7abd:   0f 1f 00                nop    DWORD PTR [rax]
    7ac0:   c5 fa 10 04 87          vmovss xmm0,DWORD PTR [rdi+rax*4]
    7ac5:   48 89 c1                mov    rcx,rax
    7ac8:   c5 fa 59 04 86          vmulss xmm0,xmm0,DWORD PTR [rsi+rax*4]
    7acd:   48 ff c0                inc    rax
    7ad0:   c5 f2 58 c8             vaddss xmm1,xmm1,xmm0
    7ad4:   c4 c1 7a 11 08          vmovss DWORD PTR [r8],xmm1
    7ad9:   49 39 cf                cmp    r15,rcx
    7adc:   75 e2                   jne    7ac0 <convolution2D+0x240>
    7ade:   ff c2                   inc    edx
    7ae0:   4c 01 e7                add    rdi,r12
    7ae3:   4c 01 e6                add    rsi,r12
    7ae6:   44 39 d2                cmp    edx,r10d
    7ae9:   75 b2                   jne    7a9d <convolution2D+0x21d>
    7aeb:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
    7af0:   4c 03 5d c8             add    r11,QWORD PTR [rbp-0x38]
    7af4:   48 03 5d c0             add    rbx,QWORD PTR [rbp-0x40]
    7af8:   41 ff c1                inc    r9d
    7afb:   44 3b 4d d4             cmp    r9d,DWORD PTR [rbp-0x2c]
    7aff:   0f 85 6b ff ff ff       jne    7a70 <convolution2D+0x1f0>
    7b05:   48 8b 5d 98             mov    rbx,QWORD PTR [rbp-0x68]
    7b09:   49 83 c0 04             add    r8,0x4
    7b0d:   48 01 5d b8             add    QWORD PTR [rbp-0x48],rbx
    7b11:   4c 3b 45 b0             cmp    r8,QWORD PTR [rbp-0x50]
    7b15:   0f 85 35 ff ff ff       jne    7a50 <convolution2D+0x1d0>
    7b1b:   8b 9d 74 ff ff ff       mov    ebx,DWORD PTR [rbp-0x8c]
    7b21:   8b 45 94                mov    eax,DWORD PTR [rbp-0x6c]
    7b24:   48 8b 8d 48 ff ff ff    mov    rcx,QWORD PTR [rbp-0xb8]
    7b2b:   01 5d d0                add    DWORD PTR [rbp-0x30],ebx
    7b2e:   48 01 4d b0             add    QWORD PTR [rbp-0x50],rcx
    7b32:   01 5d 84                add    DWORD PTR [rbp-0x7c],ebx
    7b35:   48 8b 8d 58 ff ff ff    mov    rcx,QWORD PTR [rbp-0xa8]
    7b3c:   41 01 da                add    r10d,ebx
    7b3f:   48 01 8d 78 ff ff ff    add    QWORD PTR [rbp-0x88],rcx
    7b46:   ff c0                   inc    eax
    7b48:   39 85 38 ff ff ff       cmp    DWORD PTR [rbp-0xc8],eax
    7b4e:   74 08                   je     7b58 <convolution2D+0x2d8>
    7b50:   89 45 94                mov    DWORD PTR [rbp-0x6c],eax
    7b53:   e9 ac fe ff ff          jmp    7a04 <convolution2D+0x184>
    7b58:   8b 4d 90                mov    ecx,DWORD PTR [rbp-0x70]
    7b5b:   48 8b b5 30 ff ff ff    mov    rsi,QWORD PTR [rbp-0xd0]
    7b62:   01 5d d4                add    DWORD PTR [rbp-0x2c],ebx
    7b65:   01 5d ac                add    DWORD PTR [rbp-0x54],ebx
    7b68:   01 5d 80                add    DWORD PTR [rbp-0x80],ebx
    7b6b:   48 01 75 88             add    QWORD PTR [rbp-0x78],rsi
    7b6f:   8d 41 01                lea    eax,[rcx+0x1]
    7b72:   39 4d 94                cmp    DWORD PTR [rbp-0x6c],ecx
    7b75:   74 08                   je     7b7f <convolution2D+0x2ff>
    7b77:   89 45 90                mov    DWORD PTR [rbp-0x70],eax
    7b7a:   e9 33 fe ff ff          jmp    79b2 <convolution2D+0x132>
    7b7f:   48 81 c4 b0 00 00 00    add    rsp,0xb0
    7b86:   5b                      pop    rbx
    7b87:   41 5c                   pop    r12
    7b89:   41 5d                   pop    r13
    7b8b:   41 5e                   pop    r14
    7b8d:   41 5f                   pop    r15
    7b8f:   5d                      pop    rbp
    7b90:   c3                      ret    
    7b91:   66 66 2e 0f 1f 84 00 00 00 00 00    data16 nop WORD PTR cs:[rax+rax*1+0x0]
    7b9c:   0f 1f 40 00             nop    DWORD PTR [rax+0x0]

For reference, I'm using an AMD Ryzen 7 CPU which uses Zen2 architecture. Here is its list of instructions (page 101).

I suspect that the data here points to a memory issue instead of simply the multiplication being the cause of the bottleneck.

Question:

How can I improve this code so that it does not cause a memory bottleneck?

I'm guessing this is actually a problem particular to my code, perhaps something related to the multidimensional arrays I'm using. If I instead used one big single-dimentional array for each variable, would the latency decrease?

Relevant information:

There are two ways I declare the variables that are passed to this function. The first is as a global variable (usually in a struct), the second is as dynamic allocation:

float (*arr)[x][y] = calloc(z, sizeof *arr);

Perhaps the order in which I declare these matrixes is not cache-friendly, but I am not sure how to re-order it.

Stride values for the previous function are always 1 or 2, usually 1.

Here is the output of valgrind --tool=cachegrind:

==430300== Cachegrind, a cache and branch-prediction profiler
==430300== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==430300== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==430300== Command: ./EmbeddedNet test 1
==430300== Parent PID: 170008
==430300== 
--430300-- warning: L3 cache found, using its data for the LL simulation.
==430300== 
==430300== I   refs:      6,369,594,192
==430300== I1  misses:            4,271
==430300== LLi misses:            2,442
==430300== I1  miss rate:          0.00%
==430300== LLi miss rate:          0.00%
==430300== 
==430300== D   refs:      2,064,233,110  (1,359,003,131 rd   + 705,229,979 wr)
==430300== D1  misses:       34,476,969  (   19,010,839 rd   +  15,466,130 wr)
==430300== LLd misses:        5,311,277  (    1,603,955 rd   +   3,707,322 wr)
==430300== D1  miss rate:           1.7% (          1.4%     +         2.2%  )
==430300== LLd miss rate:           0.3% (          0.1%     +         0.5%  )
==430300== 
==430300== LL refs:          34,481,240  (   19,015,110 rd   +  15,466,130 wr)
==430300== LL misses:         5,313,719  (    1,606,397 rd   +   3,707,322 wr)
==430300== LL miss rate:            0.1% (          0.0%     +         0.5%  )

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

冷月断魂刀 2025-01-19 15:18:46

从 Cachegrind 的结果来看，内存似乎不是您的瓶颈。无论如何，NN 必须存储在内存中，但如果它太大以至于您的程序有很多 L1 缓存未命中，那么值得考虑尝试尽量减少 L1 未命中，但 1.7% 的 L1（数据）未命中率并不是一个问题。问题。

所以无论如何你都在努力让这个运行得更快。查看您的代码，最内部循环发生的事情非常简单（加载 - >乘法 - >添加 - >存储），并且除了最终存储之外，它没有任何副作用。这种代码很容易并行化，例如通过多线程或向量化。我认为您会知道如何在多个线程中运行它，因为您可以编写具有一定复杂性的代码，并且您在评论中询问如何手动矢量化代码。

我将解释这一部分，但要记住的一件事是，一旦您选择手动对代码进行向量化，它通常会与某些 CPU 架构绑定。我们不要考虑像 ARM 这样的非 AMD64 兼容 CPU。尽管如此，您仍然可以选择 MMX、SSE、AVX 和 AVX512 作为矢量化计算的扩展，并且每个扩展都有多个版本。如果您想要最大的可移植性，SSE2 是一个合理的选择。 SSE2 随 Pentium 4 一起出现，支持 128 位向量。在这篇文章中，我将使用 AVX2，它支持 128 位和 256 位向量。它在您的 CPU 上运行良好，并且目前具有合理的可移植性，并受到 Haswell (2013) 和 Excavator (2015) 的支持。

您在内循环中使用的模式称为 FMA（融合乘法和加法）。 AVX2 有这方面的说明。看一下这个函数和编译的输出。

float fma_scl(float a, float b, float c) {
    return a * b + c;
}

fma_scl:
        vfmadd132ss     xmm0, xmm2, xmm1
        ret

您可以看到使用一条指令完成的计算。

我们将使用 GCC 的向量扩展定义 256 位向量类型。

typedef float Vec __attribute__((vector_size(32), aligned(32)));

这是一个矢量化的 fma 函数。

Vec fma_vec(Vec a, Vec b, Vec c) {
    return a * b + c;
}

fma_vec:
        vfmadd132ps     ymm0, ymm2, ymm1
        ret

上面的代码在语义上与下面的代码相同，但一切都是在一条指令中完成的。

typedef struct {
    float f[8];
} Vec_;

Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
    Vec_ r;
    for (unsigned i = 0; i < 8; ++i) {
        r.f[i] = a.f[i] * b.f[i] + c.f[i];
    }
    return r;
}

我想您现在应该明白通过矢量化使代码运行得更快。

这是一个与内部循环有些相似的简单函数。

void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
    for (unsigned i = 0; i < n; ++i) {
        a[i] = fma_scl(b[i], c[i], a[i]);
    }
}

当您使用 -O3 -march=znver2 通过 GCC 进行编译时，这就是输出。它是巨大的。下面我会解释一下。

loopadd_scl:
        test    ecx, ecx
        je      .L25
        lea     eax, [rcx-1]
        cmp     eax, 6
        jbe     .L13
        mov     r8d, ecx
        xor     eax, eax
        shr     r8d, 3
        sal     r8, 5
.L9:
        vmovups ymm1, YMMWORD PTR [rdi+rax]
        vmovups ymm0, YMMWORD PTR [rdx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovups YMMWORD PTR [rdi+rax], ymm0
        add     rax, 32
        cmp     r8, rax
        jne     .L9
        mov     eax, ecx
        and     eax, -8
        test    cl, 7
        je      .L26
        vzeroupper
.L8:
        mov     r9d, ecx
        sub     r9d, eax
        lea     r8d, [r9-1]
        cmp     r8d, 2
        jbe     .L11
        mov     r8d, eax
        sal     r8, 2
        lea     r10, [rdi+r8]
        vmovups xmm0, XMMWORD PTR [rdx+r8]
        vmovups xmm2, XMMWORD PTR [r10]
        vfmadd132ps     xmm0, xmm2, XMMWORD PTR [rsi+r8]
        mov     r8d, r9d
        and     r8d, -4
        add     eax, r8d
        and     r9d, 3
        vmovups XMMWORD PTR [r10], xmm0
        je      .L25
.L11:
        mov     r8d, eax
        sal     r8, 2
        lea     r9, [rdi+r8]
        vmovss  xmm0, DWORD PTR [rdx+r8]
        vmovss  xmm3, DWORD PTR [r9]
        vfmadd132ss     xmm0, xmm3, DWORD PTR [rsi+r8]
        lea     r8d, [rax+1]
        vmovss  DWORD PTR [r9], xmm0
        cmp     r8d, ecx
        jnb     .L25
        sal     r8, 2
        add     eax, 2
        lea     r9, [rdi+r8]
        vmovss  xmm0, DWORD PTR [rsi+r8]
        vmovss  xmm4, DWORD PTR [r9]
        vfmadd132ss     xmm0, xmm4, DWORD PTR [rdx+r8]
        vmovss  DWORD PTR [r9], xmm0
        cmp     eax, ecx
        jnb     .L25
        sal     rax, 2
        add     rdi, rax
        vmovss  xmm0, DWORD PTR [rdx+rax]
        vmovss  xmm5, DWORD PTR [rdi]
        vfmadd132ss     xmm0, xmm5, DWORD PTR [rsi+rax]
        vmovss  DWORD PTR [rdi], xmm0
.L25:
        ret
.L26:
        vzeroupper
        ret
.L13:
        xor     eax, eax
        jmp     .L8

基本上，GCC 对 n 一无所知，因此它将循环分为 3 种情况： n / 8 > 1，1，n 4..它首先处理n / 8 > 1 部分使用 256 位 ymm 寄存器。然后，它处理n / 4 > 1 具有 128 位 xmm 寄存器。最后，它处理n << 4 带有标量 ss 指令。

如果您知道 n 是 8 的倍数，您就可以避免这种混乱。我现在有点懒，所以看一下下面的代码和编译器输出，并将其与上面的进行比较。我认为你足够聪明，能够明白这个想法。

void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
    n /= 8;
    for (unsigned i = 0; i < n; ++i) {
        a[i] = fma_vec(b[i], c[i], a[i]);
    }
}

loopadd_vec:
        shr     ecx, 3
        je      .L34
        mov     ecx, ecx
        xor     eax, eax
        sal     rcx, 5
.L29:
        vmovaps ymm1, YMMWORD PTR [rdi+rax]
        vmovaps ymm0, YMMWORD PTR [rdx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovaps YMMWORD PTR [rdi+rax], ymm0
        add     rax, 32
        cmp     rcx, rax
        jne     .L29
        vzeroupper
.L34:
        ret
}

Looking at the result of Cachegrind, it doesn't look like the memory is your bottleneck. The NN has to be stored in memory anyway, but if it's too large that your program's having a lot of L1 cache misses, then it's worth thinking to try to minimize L1 misses, but 1.7% of L1 (data) miss rate is not a problem.

So you're trying to make this run fast anyway. Looking at your code, what's happening at the most inner loop is very simple (load-> multiply -> add -> store), and it doesn't have any side effect other than the final store. This kind of code is easily parallelizable, for example, by multithreading or vectorizing. I think you'll know how to make this run in multiple threads seeing that you can write code with some complexity, and you asked in comments how to manually vectorize the code.

I will explain that part, but one thing to bear in mind is that once you choose to manually vectorize the code, it will often be tied to certain CPU architectures. Let's not consider non-AMD64 compatible CPUs like ARM. Still, you have the option of MMX, SSE, AVX, and AVX512 to choose as an extension for vectorized computation, and each extension has multiple versions. If you want maximum portability, SSE2 is a reasonable choice. SSE2 appeared with Pentium 4, and it supports 128-bit vectors. For this post I'll use AVX2, which supports 128-bit and 256-bit vectors. It runs fine on your CPU, and has reasonable portability these days, supported from Haswell (2013) and Excavator (2015).

The pattern you're using in the inner loop is called FMA (fused multiply and add). AVX2 has an instruction for this. Have a look at this function and the compiled output.

float fma_scl(float a, float b, float c) {
    return a * b + c;
}

fma_scl:
        vfmadd132ss     xmm0, xmm2, xmm1
        ret

You can see the calculation done with a single instruction.

We'll define a 256-bit vector type using GCC's vector extension.

typedef float Vec __attribute__((vector_size(32), aligned(32)));

Here's a vectorized fma function.

Vec fma_vec(Vec a, Vec b, Vec c) {
    return a * b + c;
}

fma_vec:
        vfmadd132ps     ymm0, ymm2, ymm1
        ret

The code above is semantically the same as the one below, but everything is done in a single instruction.

typedef struct {
    float f[8];
} Vec_;

Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
    Vec_ r;
    for (unsigned i = 0; i < 8; ++i) {
        r.f[i] = a.f[i] * b.f[i] + c.f[i];
    }
    return r;
}

I think you'll now get the idea of making code run faster by vectorization.

Here is a simple function that's somewhat similar to your inner loop.

void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
    for (unsigned i = 0; i < n; ++i) {
        a[i] = fma_scl(b[i], c[i], a[i]);
    }
}

When you compile through GCC with -O3 -march=znver2, this is the output. It's huge. I'll explain below.

loopadd_scl:
        test    ecx, ecx
        je      .L25
        lea     eax, [rcx-1]
        cmp     eax, 6
        jbe     .L13
        mov     r8d, ecx
        xor     eax, eax
        shr     r8d, 3
        sal     r8, 5
.L9:
        vmovups ymm1, YMMWORD PTR [rdi+rax]
        vmovups ymm0, YMMWORD PTR [rdx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovups YMMWORD PTR [rdi+rax], ymm0
        add     rax, 32
        cmp     r8, rax
        jne     .L9
        mov     eax, ecx
        and     eax, -8
        test    cl, 7
        je      .L26
        vzeroupper
.L8:
        mov     r9d, ecx
        sub     r9d, eax
        lea     r8d, [r9-1]
        cmp     r8d, 2
        jbe     .L11
        mov     r8d, eax
        sal     r8, 2
        lea     r10, [rdi+r8]
        vmovups xmm0, XMMWORD PTR [rdx+r8]
        vmovups xmm2, XMMWORD PTR [r10]
        vfmadd132ps     xmm0, xmm2, XMMWORD PTR [rsi+r8]
        mov     r8d, r9d
        and     r8d, -4
        add     eax, r8d
        and     r9d, 3
        vmovups XMMWORD PTR [r10], xmm0
        je      .L25
.L11:
        mov     r8d, eax
        sal     r8, 2
        lea     r9, [rdi+r8]
        vmovss  xmm0, DWORD PTR [rdx+r8]
        vmovss  xmm3, DWORD PTR [r9]
        vfmadd132ss     xmm0, xmm3, DWORD PTR [rsi+r8]
        lea     r8d, [rax+1]
        vmovss  DWORD PTR [r9], xmm0
        cmp     r8d, ecx
        jnb     .L25
        sal     r8, 2
        add     eax, 2
        lea     r9, [rdi+r8]
        vmovss  xmm0, DWORD PTR [rsi+r8]
        vmovss  xmm4, DWORD PTR [r9]
        vfmadd132ss     xmm0, xmm4, DWORD PTR [rdx+r8]
        vmovss  DWORD PTR [r9], xmm0
        cmp     eax, ecx
        jnb     .L25
        sal     rax, 2
        add     rdi, rax
        vmovss  xmm0, DWORD PTR [rdx+rax]
        vmovss  xmm5, DWORD PTR [rdi]
        vfmadd132ss     xmm0, xmm5, DWORD PTR [rsi+rax]
        vmovss  DWORD PTR [rdi], xmm0
.L25:
        ret
.L26:
        vzeroupper
        ret
.L13:
        xor     eax, eax
        jmp     .L8

Basically GCC doesn't know anything about n, so it's splitting the loop to 3 cases: n / 8 > 1, n / 4 > 1, n < 4. It first deals with the n / 8 > 1 part using 256-bit ymm registers. Then, it deals with n / 4 > 1 with 128-bit xmm registers. Finally, it deals with n < 4 with scalar ss instructions.

You can avoid this mess if you know n is a multiple of 8. I got a bit lazy now, so have a look at the code and the compiler output below and compare it with the above. I think you're smart enough to get the idea.

void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
    n /= 8;
    for (unsigned i = 0; i < n; ++i) {
        a[i] = fma_vec(b[i], c[i], a[i]);
    }
}

loopadd_vec:
        shr     ecx, 3
        je      .L34
        mov     ecx, ecx
        xor     eax, eax
        sal     rcx, 5
.L29:
        vmovaps ymm1, YMMWORD PTR [rdi+rax]
        vmovaps ymm0, YMMWORD PTR [rdx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovaps YMMWORD PTR [rdi+rax], ymm0
        add     rax, 32
        cmp     rcx, rax
        jne     .L29
        vzeroupper
.L34:
        ret
}

回复收藏 0 原文

~没有更多了~