CUDA 中共享内存的原子操作
我使用 GTX 280,其计算能力为 1.3,并支持共享内存上的原子操作。我正在使用 cuda SDK 2.2 和 VS 2005。在我的程序中,我必须广泛使用原子操作,因为根本没有其他方法。
一个例子是,我必须计算数组的运行总和,并找出总和超过给定截止值的索引。为此,我使用 scan算法并在值小于阈值时使用atomicMin
存储索引。因此,最终共享内存将具有值刚好小于阈值的索引。
这只是内核的一个组件,内核调用中还有很多类似的代码块。
我遇到了 3 个问题
- 首先,我无法编译代码,因为它说原子操作未定义,我已经搜索但没有找到我必须添加哪个文件。
- 其次,我设法通过将代码复制到 CUDA SDK 提供的代码中来编译代码,但随后它说共享内存上不支持原子操作,因为它在以下 program
- 即使我通过给出解决方案
-arch sm_12
在命令行编译中,使用这些原子操作的代码片段花费了非常多的时间。
我相信在最坏的情况下我应该得到某种程度的加速,因为没有太多的原子操作并且我使用了 1 个 16x16 的块。不幸的是,串行代码的运行速度快了 10 倍。
下面我发布了内核 cod*,这个内核调用似乎是瓶颈,如果有人可以帮助我优化那就太好了。串行代码只是以串行方式执行这些操作。我使用的是 16 X 16 的块配置。
代码看起来很长,但实际上它包含执行几乎相同任务的 if 代码块和 while 代码块,但它们无法合并。
#define limit (int)(log((float)256)/log((float)2))
// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16
__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{
int j = threadIdx.x;
int i = threadIdx.y;
int k = i*blockDim.x+j;
__shared__ int prefix_sum[256];
__shared__ int sum_s[256];
__shared__ int ary_shared[256];
__shared__ int he_shared[256];
// this is the threshold
int cutval = (2*width*height)*cutoff;
prefix_sum[k] = cont[k];
int l;
// a variant of scan algorithm
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > cutval)
{
atomicMin(&vars[cut],k);
}
}
__syncthreads();
}
// The first thread will store the value in global array
if(k==0)
{
vars[cuts]=prefix_sum[vars[cut]];
}
__syncthreads();
if(vars[n])
{
// bs = 7 in this case
if(i<bs && j<bs)
{
// using atomic add because the index could be same for 2 different threads
atomicAdd(&ary_shared[in[i*(width) + j]],1);
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
sum_s[k] = 0;
// Again prefix sum
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
// because we will always overshoot by 1
vars[hmin]--;
}
__syncthreads();
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
// only 7x7 = 49 threads will do this
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
if(k==0)
vars[n]--;
__syncthreads();
}// if(n) block closes here
while(vars[n])
{
if(k==0)
{
if( vars[ox]==0 && vars[d1] ==3 )
vars[d1] = 0; // l2r
else if( vars[ox]==0 && vars[d1]==2 )
vars[d1] = 3; // l u2d
else if( vars[ox]==width-bs && vars[d1]==0)
vars[d1] = 1; // r u2d
else if( vars[ox]==width-bs && vars[d1]==1)
vars[d1] = 2; // r2l
}
// Because this value will be changed so
// all the threads should set their registers before
// they move forward
int ox_d = vars[ox];
int oy_d = vars[oy];
// Just putting it here so that all the threads should have set their
// values before moving on, as this value will be changed
__syncthreads();
if(vars[d1]==0)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + oy_d*width +bs;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[ox]++;
}
else if(vars[d1]==1||vars[d1]==3)
{
if(i == 0 && j < bs)
{
/*if(j==0)
printf("Entered 1||3\n");*/
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + (oy_d+bs)*width;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[oy]++;
}
else if(vars[d1]==2)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d-1 + oy_d*width;
int index2 = j*width + ox_d-1 + oy_d*width +bs;
atomicAdd(&ary_shared[in[index]],1);
atomicSub(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0 )
vars[ox]--;
}
__syncthreads();
//ary_shared has been calculated
// Reset the hmin and hminc values
// again the same task as done in the if(n) loop
if(k==0)
{
vars[hmin]=0;
vars[hminc]=0;
vars[hmax]=0;
vars[hmaxc]=0;
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
vars[hmin]--;
}
__syncthreads();
// Calculate maxth
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
// This just might cause a little bit of problem
if(k==0)
vars[n]--;
// All threads will wait here before continuing the while loop
__syncthreads();
}// end of while(n)
}
I use a GTX 280, which has compute capability 1.3 and supports atomic operations on shared memory. I am using cuda SDK 2.2 and VS 2005. In my program I have to extensively use atomic operations because there is simply no other way.
One example is that I have to calculate the running sum of an array and find out the index where the sum exceeds a given cut off value. For this I am using a variant of scan algorithm and using atomicMin
to store index while the value is less than the threshold. So this way at the end the shared memory would have the index where the value is just less than the threshold.
This is just one component of the kernel, and there are many similar code blocks in the kernel call.
I am having 3 problems
- Firstly I have not been able to compile the code as it say atomic operations are not defined, I have searched but not found which file I have to add.
- Second, I somehow managed to compile the code by copying it in the code provided by CUDA SDK, but then it is saying the atomic operations are not supported on shared memory, where as it is running in the following program
- Even when I worked around a hack by giving
-arch sm_12
in the command line compilation, the code snippet using these atomic operations are taking an awful lot of time.
I believe that in the worst case I should get some sort of speed up, because there are not very many atomic operations and I using 1 block of 16x16. Unfortunately the serial code in running 10x faster.
Below I am posting the kernel cod*, this kernel call seems to be the bottleneck if anyone could help me optimize then it would be nice. The serial code is just performing these actions in a serial manner. I am using a block configuration of 16 X 16.
The code seems to be lengthy but actually it contains an if code block and while code block that perform almost the same task, but they could not be merged.
#define limit (int)(log((float)256)/log((float)2))
// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16
__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{
int j = threadIdx.x;
int i = threadIdx.y;
int k = i*blockDim.x+j;
__shared__ int prefix_sum[256];
__shared__ int sum_s[256];
__shared__ int ary_shared[256];
__shared__ int he_shared[256];
// this is the threshold
int cutval = (2*width*height)*cutoff;
prefix_sum[k] = cont[k];
int l;
// a variant of scan algorithm
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > cutval)
{
atomicMin(&vars[cut],k);
}
}
__syncthreads();
}
// The first thread will store the value in global array
if(k==0)
{
vars[cuts]=prefix_sum[vars[cut]];
}
__syncthreads();
if(vars[n])
{
// bs = 7 in this case
if(i<bs && j<bs)
{
// using atomic add because the index could be same for 2 different threads
atomicAdd(&ary_shared[in[i*(width) + j]],1);
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
sum_s[k] = 0;
// Again prefix sum
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
// because we will always overshoot by 1
vars[hmin]--;
}
__syncthreads();
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
// only 7x7 = 49 threads will do this
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
if(k==0)
vars[n]--;
__syncthreads();
}// if(n) block closes here
while(vars[n])
{
if(k==0)
{
if( vars[ox]==0 && vars[d1] ==3 )
vars[d1] = 0; // l2r
else if( vars[ox]==0 && vars[d1]==2 )
vars[d1] = 3; // l u2d
else if( vars[ox]==width-bs && vars[d1]==0)
vars[d1] = 1; // r u2d
else if( vars[ox]==width-bs && vars[d1]==1)
vars[d1] = 2; // r2l
}
// Because this value will be changed so
// all the threads should set their registers before
// they move forward
int ox_d = vars[ox];
int oy_d = vars[oy];
// Just putting it here so that all the threads should have set their
// values before moving on, as this value will be changed
__syncthreads();
if(vars[d1]==0)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + oy_d*width +bs;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[ox]++;
}
else if(vars[d1]==1||vars[d1]==3)
{
if(i == 0 && j < bs)
{
/*if(j==0)
printf("Entered 1||3\n");*/
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + (oy_d+bs)*width;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[oy]++;
}
else if(vars[d1]==2)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d-1 + oy_d*width;
int index2 = j*width + ox_d-1 + oy_d*width +bs;
atomicAdd(&ary_shared[in[index]],1);
atomicSub(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0 )
vars[ox]--;
}
__syncthreads();
//ary_shared has been calculated
// Reset the hmin and hminc values
// again the same task as done in the if(n) loop
if(k==0)
{
vars[hmin]=0;
vars[hminc]=0;
vars[hmax]=0;
vars[hmaxc]=0;
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
vars[hmin]--;
}
__syncthreads();
// Calculate maxth
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if( k <= vars[hmin] )
he_shared[k] = 0;
else if( k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin])) / rng;
}
__syncthreads();
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
// This just might cause a little bit of problem
if(k==0)
vars[n]--;
// All threads will wait here before continuing the while loop
__syncthreads();
}// end of while(n)
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
首先,您需要
-arch sm_12
(或者在您的情况下它实际上应该是-arch sm_13
)来启用原子操作。至于性能,无法保证您的内核会比 CPU 上的正常代码更快 - 有许多问题确实不太适合 CUDA 模型,并且这些问题确实可能比在 CPU 上运行慢得多。在编写任何 CUDA 内核之前,您需要进行一些分析/设计/建模,以防止自己在永远不会成功的事情上浪费大量时间。
话虽如此,也许有一种方法可以更有效地实现您的算法 - 也许您可以发布 CPU 代码,然后邀请有关如何在 CUDA 中有效实现它的想法?
Firstly you need
-arch sm_12
(or in your case it should really be-arch sm_13
) to enable atomic operations.As for performance, there is no guarantee that your kernel will be any faster than normal code on the CPU - there are many problems which really do not fit well into the CUDA model and these may indeed run much slower than on the CPU. You need to do some analysis/design/modelling before coding any CUDA kernels to prevent yourself wasting a lot of time on something that is never going to fly.
Having said that, there may be a way to implement your algo in a more efficient way - maybe you could post the CPU code and then invite ideas as to how to efficiently implement it in CUDA ?