CUDA: more threads for the same work = longer run time despite higher occupancy, why?
I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.
I created the following program to illustrate the problem:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
__global__ void less_threads(float * d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
//Do 12 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
num_inliers += j*(j+5);
num_inliers += j*(j+6);
num_inliers += j*(j+7);
num_inliers += j*(j+8);
num_inliers += j*(j+9);
num_inliers += j*(j+10);
num_inliers += j*(j+11);
num_inliers += j*(j+12);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
__global__ void more_threads(float *d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
// Do 4 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
int main(int argc, char* argv[])
{
float *d_out = NULL;
cudaMalloc((void**)&d_out,sizeof(float)*25000);
more_threads<<<780,128>>>(d_out);
less_threads<<<780,32>>>(d_out);
return 0;
}
And the PTX output is:
.entry _Z12less_threadsPf (
.param .u32 __cudaparm__Z12less_threadsPf_d_out)
{
.reg .u32 %r<35>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 6 0
// 2 #include <stdlib.h>
// 3 #include <cuda_runtime.h>
// 4 #include <cutil.h>
// 5
// 6 __global__ void less_threads(float * d_out) {
$LBB1__Z12less_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, 0;
mov.s32 %r5, 0;
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
mov.s32 %r9, 0;
mov.s32 %r10, 0;
mov.s32 %r11, 0;
mov.s32 %r12, %r13;
mov.s32 %r14, 0;
$Lt_0_2562:
//<loop> Loop body line 6, nesting depth: 1, iterations: 800
.loc 17 10 0
// 7 int num_inliers;
// 8 for (int j=0;j<800;++j) {
// 9 //Do 12 computations
// 10 num_inliers += j*(j+1);
mul.lo.s32 %r15, %r14, %r14;
add.s32 %r16, %r12, %r14;
add.s32 %r12, %r15, %r16;
.loc 17 11 0
// 11 num_inliers += j*(j+2);
add.s32 %r17, %r15, %r12;
add.s32 %r12, %r1, %r17;
.loc 17 12 0
// 12 num_inliers += j*(j+3);
add.s32 %r18, %r15, %r12;
add.s32 %r12, %r2, %r18;
.loc 17 13 0
// 13 num_inliers += j*(j+4);
add.s32 %r19, %r15, %r12;
add.s32 %r12, %r3, %r19;
.loc 17 14 0
// 14 num_inliers += j*(j+5);
add.s32 %r20, %r15, %r12;
add.s32 %r12, %r4, %r20;
.loc 17 15 0
// 15 num_inliers += j*(j+6);
add.s32 %r21, %r15, %r12;
add.s32 %r12, %r5, %r21;
.loc 17 16 0
// 16 num_inliers += j*(j+7);
add.s32 %r22, %r15, %r12;
add.s32 %r12, %r6, %r22;
.loc 17 17 0
// 17 num_inliers += j*(j+8);
add.s32 %r23, %r15, %r12;
add.s32 %r12, %r7, %r23;
.loc 17 18 0
// 18 num_inliers += j*(j+9);
add.s32 %r24, %r15, %r12;
add.s32 %r12, %r8, %r24;
.loc 17 19 0
// 19 num_inliers += j*(j+10);
add.s32 %r25, %r15, %r12;
add.s32 %r12, %r9, %r25;
.loc 17 20 0
// 20 num_inliers += j*(j+11);
add.s32 %r26, %r15, %r12;
add.s32 %r12, %r10, %r26;
.loc 17 21 0
// 21 num_inliers += j*(j+12);
add.s32 %r27, %r15, %r12;
add.s32 %r12, %r11, %r27;
add.s32 %r14, %r14, 1;
add.s32 %r11, %r11, 12;
add.s32 %r10, %r10, 11;
add.s32 %r9, %r9, 10;
add.s32 %r8, %r8, 9;
add.s32 %r7, %r7, 8;
add.s32 %r6, %r6, 7;
add.s32 %r5, %r5, 6;
add.s32 %r4, %r4, 5;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r28, 1600;
setp.ne.s32 %p1, %r1, %r28;
@%p1 bra $Lt_0_2562;
cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;
@%p2 bra $Lt_0_3074;
.loc 17 25 0
// 22 }
// 23
// 24 if (threadIdx.x == -1)
// 25 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r12;
ld.param.u32 %r31, [__cudaparm__Z12less_threadsPf_d_out];
mul24.lo.u32 %r32, %r29, 4;
add.u32 %r33, %r31, %r32;
st.global.f32 [%r33+0], %f1;
$Lt_0_3074:
.loc 17 26 0
// 26 }
exit;
$LDWend__Z12less_threadsPf:
} // _Z12less_threadsPf
.entry _Z12more_threadsPf (
.param .u32 __cudaparm__Z12more_threadsPf_d_out)
{
.reg .u32 %r<19>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 28 0
// 27
// 28 __global__ void more_threads(float *d_out) {
$LBB1__Z12more_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, %r5;
mov.s32 %r6, 0;
$Lt_1_2562:
//<loop> Loop body line 28, nesting depth: 1, iterations: 800
.loc 17 32 0
// 29 int num_inliers;
// 30 for (int j=0;j<800;++j) {
// 31 // Do 4 computations
// 32 num_inliers += j*(j+1);
mul.lo.s32 %r7, %r6, %r6;
add.s32 %r8, %r4, %r6;
add.s32 %r4, %r7, %r8;
.loc 17 33 0
// 33 num_inliers += j*(j+2);
add.s32 %r9, %r7, %r4;
add.s32 %r4, %r1, %r9;
.loc 17 34 0
// 34 num_inliers += j*(j+3);
add.s32 %r10, %r7, %r4;
add.s32 %r4, %r2, %r10;
.loc 17 35 0
// 35 num_inliers += j*(j+4);
add.s32 %r11, %r7, %r4;
add.s32 %r4, %r3, %r11;
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r12, 1600;
setp.ne.s32 %p1, %r1, %r12;
@%p1 bra $Lt_1_2562;
cvt.u32.u16 %r13, %tid.x;
mov.u32 %r14, -1;
setp.ne.u32 %p2, %r13, %r14;
@%p2 bra $Lt_1_3074;
.loc 17 38 0
// 36 }
// 37 if (threadIdx.x == -1)
// 38 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r4;
ld.param.u32 %r15, [__cudaparm__Z12more_threadsPf_d_out];
mul24.lo.u32 %r16, %r13, 4;
add.u32 %r17, %r15, %r16;
st.global.f32 [%r17+0], %f1;
$Lt_1_3074:
.loc 17 39 0
// 39 }
exit;
$LDWend__Z12more_threadsPf:
} // _Z12more_threadsPf
Note that both kernels should do the same amount of work in total (the if (threadIdx.x == -1) test is a trick to stop the compiler optimising everything out and leaving an empty kernel). The work should be the same because more_threads uses 4 times as many threads, with each thread doing a quarter of the work.
Significant results from the profiler are as follows:
more_threads: GPU runtime = 1474 us, reg per thread = 6, occupancy = 1, branch = 83746, divergent_branch = 26, instructions = 584065, gst request = 1084552
less_threads: GPU runtime = 921 us, reg per thread = 14, occupancy = 0.25, branch = 20956, divergent_branch = 26, instructions = 312663, gst request = 677381
As I said previously, the run time of the kernel using more threads is longer; this could be due to the increased number of instructions.
Why are there more instructions?
Why is there any branching, let alone divergent branching, considering there is no conditional code?
Why are there any gst requests when there is no global memory access?
What is going on here!
Thanks
Update
Added PTX code and fixed CUDA C so it should compile
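The runtimes above came from the profiler; a minimal sketch of timing the two launches directly with CUDA events instead (kernels as defined above; error checking omitted):

#include <stdio.h>
#include <cuda_runtime.h>

// more_threads and less_threads as defined above

int main(void)
{
    float *d_out = NULL;
    cudaMalloc((void**)&d_out, sizeof(float) * 25000);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms = 0.0f;

    cudaEventRecord(start);
    more_threads<<<780, 128>>>(d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);            // wait for the kernel to finish
    cudaEventElapsedTime(&ms, start, stop);
    printf("more_threads: %.3f ms\n", ms);

    cudaEventRecord(start);
    less_threads<<<780, 32>>>(d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("less_threads: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    return 0;
}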
3 Answers
The two functions are not doing the same amount of work.

more_threads<<<780, 128>>>(): 780*128*800*(4+8) = 958,464,000
less_threads<<<780, 32>>>(): 780*32*800*(12+24) = 718,848,000

(counting each num_inliers += j*(j+k) as one multiply plus two adds, so N computations per iteration cost N multiplies and 2N adds)

So, more_threads is doing more work than less_threads, which is why the number of instructions goes up and why more_threads is slower. To fix more_threads, do only 3 computations inside the loop, as sketched below: 780*128*800*(3+6) = 718,848,000.
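A minimal sketch of that fix (the kernel name is ours; the never-true guard is copied from the question to keep the compiler from deleting the loop):

__global__ void more_threads_fixed(float *d_out) {
    int num_inliers = 0;
    for (int j = 0; j < 800; ++j) {
        // Do 3 computations: 3 multiplies + 6 adds per iteration
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
    }
    if (threadIdx.x == -1)                 // never true for any real thread;
        d_out[threadIdx.x] = num_inliers;  // stops the loop being optimised away
}

Launched as more_threads_fixed<<<780, 128>>>(d_out), its total work then matches less_threads<<<780, 32>>>.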
Since your code has only arithmetic instructions, you don't need very high occupancy to hide the latency of the arithmetic units. Indeed, even if you do have memory instructions you can maximise performance with ~50% occupancy provided your reads/writes are efficient. See the recorded Advanced CUDA C presentation for more information on occupancy and performance.
In your case, given that your kernel doesn't need high occupancy to saturate the arithmetic units, you will get better performance using fewer, larger blocks than more, smaller blocks, since there is a cost to launching blocks. In general, however, the cost of launching blocks is negligible compared with the time spent actually running the code.
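On later CUDA toolkits (the occupancy API postdates this post's compute 1.x era), the theoretical occupancy of a launch configuration can be queried directly; a minimal sketch:

#include <stdio.h>
#include <cuda_runtime.h>

// Stand-in for the question's kernel; only the resource usage matters here.
__global__ void more_threads(float *d_out) { (void)d_out; }

int main(void)
{
    int blocksPerSM = 0;
    // How many 128-thread blocks of more_threads can be resident per SM?
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, more_threads, 128, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    float occupancy = (float)(blocksPerSM * 128) / prop.maxThreadsPerMultiProcessor;
    printf("resident blocks/SM: %d, theoretical occupancy: %.2f\n",
           blocksPerSM, occupancy);
    return 0;
}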
Why are there more instructions?
Remember that the counters are not counting per block (aka CTA) but per SM (Streaming Multiprocessor) or per TPC (Texture Processing Cluster), which is a group of two or three SMs depending on your device. The instruction count is per SM.
It is fair to expect the less_threads kernel to have fewer instructions; however, you are launching four times as many warps per block, which means each SM will execute the code approximately four times as many times. Taking the shorter kernel code into account, your measurement doesn't seem unreasonable.
Why is there any branching?
Actually you do have conditional code:

if (threadIdx.x == -1)
This has a condition, but all threads within a warp execute the same path, so it is not divergent. My guess is that the divergence is somewhere in the administrative code; you could inspect the PTX to analyse this if you were worried. A divergent_branch count of 26 is very low compared with the number of instructions executed, so it will not affect your performance.
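For contrast, a small illustrative kernel (ours, not from the original answer): the question's guard is uniform across each warp, whereas a condition on thread parity is divergent:

__global__ void branch_demo(float *out) {
    // Uniform branch: every thread evaluates the condition the same way
    // (here it is false for all threads), so the warp does not diverge.
    if (threadIdx.x == -1)
        out[0] = 0.0f;

    // Divergent branch: threads within one warp take different paths,
    // so the warp executes both paths serially.
    if (threadIdx.x % 2 == 0)
        out[threadIdx.x] = 1.0f;
    else
        out[threadIdx.x] = 2.0f;
}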
Why are there any gst requests?
In your code you have:

d_out[threadIdx.x] = num_inliers;
This will be handled by the load/store unit, and hence counted, even though it results in no actual transaction. The gst_32/gst_64/gst_128 counters indicate actual memory transfers (your device has compute capability 1.2 or 1.3; older devices have a different set of counters).
The two functions have a different number of lines of code, and therefore a different number of instructions.
The for loop is implemented using branches; the last line of code is always divergent.
A global store request is not the same as a global store: the operation is set up, but never committed.
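The PTX from the question shows this directly (excerpt from less_threads; comments ours):

cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;   // p2 = (threadIdx.x != -1), always true
@%p2 bra $Lt_0_3074;           // so the branch is always taken...
...
st.global.f32 [%r33+0], %f1;   // ...and this store is set up but never commits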