CUDA: more threads for the same work = longer run time despite higher occupancy, why?
I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.
I created the following program to illustrate the problem:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
__global__ void less_threads(float * d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
//Do 12 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
num_inliers += j*(j+5);
num_inliers += j*(j+6);
num_inliers += j*(j+7);
num_inliers += j*(j+8);
num_inliers += j*(j+9);
num_inliers += j*(j+10);
num_inliers += j*(j+11);
num_inliers += j*(j+12);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
__global__ void more_threads(float *d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
// Do 4 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
int main(int argc, char* argv[])
{
float *d_out = NULL;
cudaMalloc((void**)&d_out,sizeof(float)*25000);
more_threads<<<780,128>>>(d_out);
less_threads<<<780,32>>>(d_out);
return 0;
}
And the PTX output is:
.entry _Z12less_threadsPf (
.param .u32 __cudaparm__Z12less_threadsPf_d_out)
{
.reg .u32 %r<35>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 6 0
// 2 #include <stdlib.h>
// 3 #include <cuda_runtime.h>
// 4 #include <cutil.h>
// 5
// 6 __global__ void less_threads(float * d_out) {
$LBB1__Z12less_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, 0;
mov.s32 %r5, 0;
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
mov.s32 %r9, 0;
mov.s32 %r10, 0;
mov.s32 %r11, 0;
mov.s32 %r12, %r13;
mov.s32 %r14, 0;
$Lt_0_2562:
//<loop> Loop body line 6, nesting depth: 1, iterations: 800
.loc 17 10 0
// 7 int num_inliers;
// 8 for (int j=0;j<800;++j) {
// 9 //Do 12 computations
// 10 num_inliers += j*(j+1);
mul.lo.s32 %r15, %r14, %r14;
add.s32 %r16, %r12, %r14;
add.s32 %r12, %r15, %r16;
.loc 17 11 0
// 11 num_inliers += j*(j+2);
add.s32 %r17, %r15, %r12;
add.s32 %r12, %r1, %r17;
.loc 17 12 0
// 12 num_inliers += j*(j+3);
add.s32 %r18, %r15, %r12;
add.s32 %r12, %r2, %r18;
.loc 17 13 0
// 13 num_inliers += j*(j+4);
add.s32 %r19, %r15, %r12;
add.s32 %r12, %r3, %r19;
.loc 17 14 0
// 14 num_inliers += j*(j+5);
add.s32 %r20, %r15, %r12;
add.s32 %r12, %r4, %r20;
.loc 17 15 0
// 15 num_inliers += j*(j+6);
add.s32 %r21, %r15, %r12;
add.s32 %r12, %r5, %r21;
.loc 17 16 0
// 16 num_inliers += j*(j+7);
add.s32 %r22, %r15, %r12;
add.s32 %r12, %r6, %r22;
.loc 17 17 0
// 17 num_inliers += j*(j+8);
add.s32 %r23, %r15, %r12;
add.s32 %r12, %r7, %r23;
.loc 17 18 0
// 18 num_inliers += j*(j+9);
add.s32 %r24, %r15, %r12;
add.s32 %r12, %r8, %r24;
.loc 17 19 0
// 19 num_inliers += j*(j+10);
add.s32 %r25, %r15, %r12;
add.s32 %r12, %r9, %r25;
.loc 17 20 0
// 20 num_inliers += j*(j+11);
add.s32 %r26, %r15, %r12;
add.s32 %r12, %r10, %r26;
.loc 17 21 0
// 21 num_inliers += j*(j+12);
add.s32 %r27, %r15, %r12;
add.s32 %r12, %r11, %r27;
add.s32 %r14, %r14, 1;
add.s32 %r11, %r11, 12;
add.s32 %r10, %r10, 11;
add.s32 %r9, %r9, 10;
add.s32 %r8, %r8, 9;
add.s32 %r7, %r7, 8;
add.s32 %r6, %r6, 7;
add.s32 %r5, %r5, 6;
add.s32 %r4, %r4, 5;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r28, 1600;
setp.ne.s32 %p1, %r1, %r28;
@%p1 bra $Lt_0_2562;
cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;
@%p2 bra $Lt_0_3074;
.loc 17 25 0
// 22 }
// 23
// 24 if (threadIdx.x == -1)
// 25 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r12;
ld.param.u32 %r31, [__cudaparm__Z12less_threadsPf_d_out];
mul24.lo.u32 %r32, %r29, 4;
add.u32 %r33, %r31, %r32;
st.global.f32 [%r33+0], %f1;
$Lt_0_3074:
.loc 17 26 0
// 26 }
exit;
$LDWend__Z12less_threadsPf:
} // _Z12less_threadsPf
.entry _Z12more_threadsPf (
.param .u32 __cudaparm__Z12more_threadsPf_d_out)
{
.reg .u32 %r<19>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 28 0
// 27
// 28 __global__ void more_threads(float *d_out) {
$LBB1__Z12more_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, %r5;
mov.s32 %r6, 0;
$Lt_1_2562:
//<loop> Loop body line 28, nesting depth: 1, iterations: 800
.loc 17 32 0
// 29 int num_inliers;
// 30 for (int j=0;j<800;++j) {
// 31 // Do 4 computations
// 32 num_inliers += j*(j+1);
mul.lo.s32 %r7, %r6, %r6;
add.s32 %r8, %r4, %r6;
add.s32 %r4, %r7, %r8;
.loc 17 33 0
// 33 num_inliers += j*(j+2);
add.s32 %r9, %r7, %r4;
add.s32 %r4, %r1, %r9;
.loc 17 34 0
// 34 num_inliers += j*(j+3);
add.s32 %r10, %r7, %r4;
add.s32 %r4, %r2, %r10;
.loc 17 35 0
// 35 num_inliers += j*(j+4);
add.s32 %r11, %r7, %r4;
add.s32 %r4, %r3, %r11;
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r12, 1600;
setp.ne.s32 %p1, %r1, %r12;
@%p1 bra $Lt_1_2562;
cvt.u32.u16 %r13, %tid.x;
mov.u32 %r14, -1;
setp.ne.u32 %p2, %r13, %r14;
@%p2 bra $Lt_1_3074;
.loc 17 38 0
// 36 }
// 37 if (threadIdx.x == -1)
// 38 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r4;
ld.param.u32 %r15, [__cudaparm__Z12more_threadsPf_d_out];
mul24.lo.u32 %r16, %r13, 4;
add.u32 %r17, %r15, %r16;
st.global.f32 [%r17+0], %f1;
$Lt_1_3074:
.loc 17 39 0
// 39 }
exit;
$LDWend__Z12more_threadsPf:
} // _Z12more_threadsPf
Note that both kernels should do the same amount of work in total (the if (threadIdx.x == -1) test is a trick to stop the compiler optimising everything out and leaving an empty kernel). The work should be the same because more_threads uses 4 times as many threads, with each thread doing a quarter of the work.
Significant results from the profiler are as follows:
more_threads: GPU runtime = 1474 us, reg per thread = 6, occupancy = 1, branch = 83746, divergent_branch = 26, instructions = 584065, gst request = 1084552
less_threads: GPU runtime = 921 us, reg per thread = 14, occupancy = 0.25, branch = 20956, divergent_branch = 26, instructions = 312663, gst request = 677381
As I said previously, the run time of the kernel using more threads is longer; this could be due to the increased number of instructions.
Why are there more instructions?
Why is there any branching, let alone divergent branching, considering there is no conditional code?
Why are there any gst requests when there is no global memory access?
What is going on here!
Thanks
Update
Added PTX code and fixed CUDA C so it should compile
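The runtimes above came from the profiler; a minimal sketch of timing the two launches directly with CUDA events instead (kernels as defined above; error checking omitted):

#include <stdio.h>
#include <cuda_runtime.h>

// more_threads and less_threads as defined above

int main(void)
{
    float *d_out = NULL;
    cudaMalloc((void**)&d_out, sizeof(float) * 25000);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms = 0.0f;

    cudaEventRecord(start);
    more_threads<<<780, 128>>>(d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);            // wait for the kernel to finish
    cudaEventElapsedTime(&ms, start, stop);
    printf("more_threads: %.3f ms\n", ms);

    cudaEventRecord(start);
    less_threads<<<780, 32>>>(d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("less_threads: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    return 0;
}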
3 Answers
The two functions are not doing the same amount of work.

more_threads<<<780, 128>>>(): 780*128*800*(4+8) = 958,464,000
less_threads<<<780, 32>>>(): 780*32*800*(12+24) = 718,848,000

(counting each num_inliers += j*(j+k) as one multiply plus two adds, so N computations per iteration cost N multiplies and 2N adds)

So, more_threads is doing more work than less_threads, which is why the number of instructions goes up and why more_threads is slower. To fix more_threads, do only 3 computations inside the loop, as sketched below: 780*128*800*(3+6) = 718,848,000.
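A minimal sketch of that fix (the kernel name is ours; the never-true guard is copied from the question to keep the compiler from deleting the loop):

__global__ void more_threads_fixed(float *d_out) {
    int num_inliers = 0;
    for (int j = 0; j < 800; ++j) {
        // Do 3 computations: 3 multiplies + 6 adds per iteration
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
    }
    if (threadIdx.x == -1)                 // never true for any real thread;
        d_out[threadIdx.x] = num_inliers;  // stops the loop being optimised away
}

Launched as more_threads_fixed<<<780, 128>>>(d_out), its total work then matches less_threads<<<780, 32>>>.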
Since your code has only arithmetic instructions, you don't need very high occupancy to hide the latency of the arithmetic units. Indeed, even if you do have memory instructions you can maximise performance with ~50% occupancy provided your reads/writes are efficient. See the recorded Advanced CUDA C presentation for more information on occupancy and performance.
In your case, given that your kernel doesn't need high occupancy to saturate the arithmetic units, you will get better performance using fewer, larger blocks than more, smaller blocks, since there is a cost to launching blocks. In general, however, the cost of launching blocks is negligible compared with the time spent actually running the code.
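On later CUDA toolkits (the occupancy API postdates this post's compute 1.x era), the theoretical occupancy of a launch configuration can be queried directly; a minimal sketch:

#include <stdio.h>
#include <cuda_runtime.h>

// Stand-in for the question's kernel; only the resource usage matters here.
__global__ void more_threads(float *d_out) { (void)d_out; }

int main(void)
{
    int blocksPerSM = 0;
    // How many 128-thread blocks of more_threads can be resident per SM?
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, more_threads, 128, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    float occupancy = (float)(blocksPerSM * 128) / prop.maxThreadsPerMultiProcessor;
    printf("resident blocks/SM: %d, theoretical occupancy: %.2f\n",
           blocksPerSM, occupancy);
    return 0;
}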
Why are there more instructions?
Remember that the counters are not counting per block (aka CTA) but per SM (Streaming Multiprocessor) or per TPC (Texture Processing Cluster), which is a group of two or three SMs depending on your device. The instruction count is per SM.
It is fair to expect the less_threads kernel to have fewer instructions; however, you are launching four times as many warps per block, which means each SM will execute the code approximately four times as many times. Taking the shorter kernel code into account, your measurement doesn't seem unreasonable.
Why is there any branching?
Actually you do have conditional code:

if (threadIdx.x == -1)
This has a condition, but all threads within a warp execute the same path, so it is not divergent. My guess is that the divergence is somewhere in the administrative code; you could inspect the PTX to analyse this if you were worried. A divergent_branch count of 26 is very low compared with the number of instructions executed, so it will not affect your performance.
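For contrast, a small illustrative kernel (ours, not from the original answer): the question's guard is uniform across each warp, whereas a condition on thread parity is divergent:

__global__ void branch_demo(float *out) {
    // Uniform branch: every thread evaluates the condition the same way
    // (here it is false for all threads), so the warp does not diverge.
    if (threadIdx.x == -1)
        out[0] = 0.0f;

    // Divergent branch: threads within one warp take different paths,
    // so the warp executes both paths serially.
    if (threadIdx.x % 2 == 0)
        out[threadIdx.x] = 1.0f;
    else
        out[threadIdx.x] = 2.0f;
}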
Why are there any gst requests?
In your code you have:

d_out[threadIdx.x] = num_inliers;
This will be handled by the load/store unit, and hence counted, even though it results in no actual transaction. The gst_32/gst_64/gst_128 counters indicate actual memory transfers (your device has compute capability 1.2 or 1.3; older devices have a different set of counters).
The two functions have a different number of lines of code, and therefore a different number of instructions.
The for loop is implemented using branches; the last line of code is always divergent.
A global store request is not the same as a global store: the operation is set up, but never committed.
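The PTX from the question shows this directly (excerpt from less_threads; comments ours):

cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;   // p2 = (threadIdx.x != -1), always true
@%p2 bra $Lt_0_3074;           // so the branch is always taken...
...
st.global.f32 [%r33+0], %f1;   // ...and this store is set up but never commits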