当前位置：文江博客话题详情

CUDA 中共享内存的原子操作

发布于 2024-12-09 00:37:18 字数 10615 浏览 0 评论 0原文

我使用 GTX 280，其计算能力为 1.3，并支持共享内存上的原子操作。我正在使用 cuda SDK 2.2 和 VS 2005。在我的程序中，我必须广泛使用原子操作，因为根本没有其他方法。

一个例子是，我必须计算数组的运行总和，并找出总和超过给定截止值的索引。为此，我使用 scan算法并在值小于阈值时使用atomicMin存储索引。因此，最终共享内存将具有值刚好小于阈值的索引。

这只是内核的一个组件，内核调用中还有很多类似的代码块。

我遇到了 3 个问题

首先，我无法编译代码，因为它说原子操作未定义，我已经搜索但没有找到我必须添加哪个文件。
其次，我设法通过将代码复制到 CUDA SDK 提供的代码中来编译代码，但随后它说共享内存上不支持原子操作，因为它在以下 program
即使我通过给出解决方案-arch sm_12 在命令行编译中，使用这些原子操作的代码片段花费了非常多的时间。

我相信在最坏的情况下我应该得到某种程度的加速，因为没有太多的原子操作并且我使用了 1 个 16x16 的块。不幸的是，串行代码的运行速度快了 10 倍。

下面我发布了内核 cod*，这个内核调用似乎是瓶颈，如果有人可以帮助我优化那就太好了。串行代码只是以串行方式执行这些操作。我使用的是 16 X 16 的块配置。

代码看起来很长，但实际上它包含执行几乎相同任务的 if 代码块和 while 代码块，但它们无法合并。

#define limit (int)(log((float)256)/log((float)2))

// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16

__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{  

    int j = threadIdx.x;
    int i = threadIdx.y;

    int k = i*blockDim.x+j;


    __shared__  int prefix_sum[256];  
    __shared__  int sum_s[256];
    __shared__  int ary_shared[256];
    __shared__  int he_shared[256];

    // this is the threshold
    int cutval = (2*width*height)*cutoff;
    prefix_sum[k] = cont[k];

    int l;
    // a variant of scan algorithm 
    for(l=0;l<=limit;l++)
    {
        sum_s[k]=prefix_sum[k];

        if(k >= (int)pow((float)2,(float)l))
        {  
            prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
            // Find out the minimum index for which the cummulative sum crosses threshold
            if(prefix_sum[k] > cutval)
            {
                atomicMin(&vars[cut],k);
            }
        }
        __syncthreads();
    }

    // The first thread will store the value in global array
    if(k==0)
    {
        vars[cuts]=prefix_sum[vars[cut]];
    }
    __syncthreads();


    if(vars[n])
    {
        // bs = 7 in this case
        if(i<bs && j<bs)
        {
            // using atomic add because the index could be same for 2 different threads
            atomicAdd(&ary_shared[in[i*(width) + j]],1);  
        }
        __syncthreads();


        int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[k];
        sum_s[k] = 0;

        // Again prefix sum

        int l;
        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > minth)
                {
                    atomicMin(&vars[hmin],k);
                }
            }
            __syncthreads();
        }

         // set the maximum value here
        if(k==0)
        {
            vars[hminc]=prefix_sum[255];
            // because we will always overshoot by 1
            vars[hmin]--;
        }

        __syncthreads();

        int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[255-k];

        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > maxth)
                {
                    atomicMin(&vars[hmax], k);
                }
            }
             __syncthreads();
        }
         // set the maximum value here

         if(k==0)
         {
            vars[hmaxc]=prefix_sum[255];
            vars[hmax]--;
            vars[hmax]=255-vars[hmax];

         }
        __syncthreads();



        int rng = vars[hmax] - vars[hmin];
        if(rng >= vars[cut])
        {
          if( k <= vars[hmin] )
                he_shared[k] = 0;
          else if( k >= vars[hmax])
                he_shared[k] = 255;
          else
                he_shared[k] = (255 * (k - vars[hmin])) / rng;
        }
         __syncthreads();

        // only 7x7 = 49 threads will do this
        if(i>0 && i<=bs && j>0 && j<=bs)
        {
           int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);

           if(rng >= vars[cut])
           {
              int value = he_shared[in[base]];
              buf[base]+=value;
              lim[base]++;
           }
           else
           {
              buf[base]+=255;
              lim[base]++;
           }
        }

        if(k==0)
            vars[n]--;

        __syncthreads();   


    }// if(n) block closes here

    while(vars[n])
    {


        if(k==0)
        {
            if( vars[ox]==0 && vars[d1] ==3 )
                vars[d1] = 0; // l2r
            else if( vars[ox]==0 && vars[d1]==2 )
                vars[d1] = 3; // l u2d
            else if( vars[ox]==width-bs && vars[d1]==0)
                vars[d1] = 1; // r u2d
            else if( vars[ox]==width-bs && vars[d1]==1)
                vars[d1] = 2; // r2l

        }

        // Because this value will be changed so
        // all the threads should set their registers before
        // they move forward
        int ox_d = vars[ox];
        int oy_d = vars[oy];

        // Just putting it here so that all the threads should have set their
        // values before moving on, as this value will be changed
        __syncthreads();

        if(vars[d1]==0)
        {

            if(i == 0 && j < bs)
            {
                int index = j*width + ox_d + oy_d*width;
                int index2 = j*width + ox_d + oy_d*width +bs;

                atomicSub(&ary_shared[in[index]],1);
                atomicAdd(&ary_shared[in[index2]],1);
            }

            // The first thread of the first block should set this value
            if(k==0)
                vars[ox]++;
        }
        else if(vars[d1]==1||vars[d1]==3)
        {

            if(i == 0 && j < bs)
            {
                /*if(j==0)
                printf("Entered 1||3\n");*/
                int index = j*width + ox_d + oy_d*width;
                int index2 = j*width + ox_d + (oy_d+bs)*width;

                atomicSub(&ary_shared[in[index]],1);
                atomicAdd(&ary_shared[in[index2]],1);

            }
            // The first thread of the first block should set this value
            if(k==0)
                vars[oy]++;

        }
        else if(vars[d1]==2)
        {

            if(i == 0 && j < bs)
            {
                int index = j*width + ox_d-1 + oy_d*width;
                int index2 = j*width + ox_d-1 + oy_d*width +bs;

                atomicAdd(&ary_shared[in[index]],1);
                atomicSub(&ary_shared[in[index2]],1);

            }
            // The first thread of the first block should set this value
            if(k==0 )
                vars[ox]--;
         }
        __syncthreads();

        //ary_shared has been calculated

        // Reset the hmin and hminc values
        // again the same task as done in the if(n) loop
        if(k==0)
        {
            vars[hmin]=0;
            vars[hminc]=0;
            vars[hmax]=0;
            vars[hmaxc]=0;
        }
        __syncthreads();

        int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[k];

        int l;
        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > minth)
                {
                    atomicMin(&vars[hmin],k);
                }
            }
            __syncthreads();
        }

         // set the maximum value here
        if(k==0)
        {
            vars[hminc]=prefix_sum[255];
            vars[hmin]--;
        }
        __syncthreads();

        // Calculate maxth
        int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[255-k];

        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > maxth)
                {
                    atomicMin(&vars[hmax], k);
                }
            }
             __syncthreads();
        }
         // set the maximum value here

         if(k==0)
         {
            vars[hmaxc]=prefix_sum[255];
            vars[hmax]--;
            vars[hmax]=255-vars[hmax];
         }
        __syncthreads();

        int rng = vars[hmax] - vars[hmin];
        if(rng >= vars[cut])
        {
          if( k <= vars[hmin] )
                he_shared[k] = 0;
          else if( k >= vars[hmax])
                he_shared[k] = 255;
          else
                he_shared[k] = (255 * (k - vars[hmin])) / rng;
        }
         __syncthreads();


        if(i>0 && i<=bs && j>0 && j<=bs)
        {
           int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);

           if(rng >= vars[cut])
           {
              int value = he_shared[in[base]];
              buf[base]+=value;
              lim[base]++;
           }
           else
           {

              buf[base]+=255;
              lim[base]++;

           }
        }

        // This just might cause a little bit of problem
        if(k==0)
            vars[n]--;

        // All threads will wait here before continuing the while loop
        __syncthreads();

    }// end of while(n)
}

原文

I use a GTX 280, which has compute capability 1.3 and supports atomic operations on shared memory. I am using cuda SDK 2.2 and VS 2005. In my program I have to extensively use atomic operations because there is simply no other way.

One example is that I have to calculate the running sum of an array and find out the index where the sum exceeds a given cut off value. For this I am using a variant of scan algorithm and using atomicMin to store index while the value is less than the threshold. So this way at the end the shared memory would have the index where the value is just less than the threshold.

This is just one component of the kernel, and there are many similar code blocks in the kernel call.

I am having 3 problems

Firstly I have not been able to compile the code as it say atomic operations are not defined, I have searched but not found which file I have to add.
Second, I somehow managed to compile the code by copying it in the code provided by CUDA SDK, but then it is saying the atomic operations are not supported on shared memory, where as it is running in the following program
Even when I worked around a hack by giving -arch sm_12 in the command line compilation, the code snippet using these atomic operations are taking an awful lot of time.

I believe that in the worst case I should get some sort of speed up, because there are not very many atomic operations and I using 1 block of 16x16. Unfortunately the serial code in running 10x faster.

Below I am posting the kernel cod*, this kernel call seems to be the bottleneck if anyone could help me optimize then it would be nice. The serial code is just performing these actions in a serial manner. I am using a block configuration of 16 X 16.

The code seems to be lengthy but actually it contains an if code block and while code block that perform almost the same task, but they could not be merged.

#define limit (int)(log((float)256)/log((float)2))

// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16

__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{  

    int j = threadIdx.x;
    int i = threadIdx.y;

    int k = i*blockDim.x+j;


    __shared__  int prefix_sum[256];  
    __shared__  int sum_s[256];
    __shared__  int ary_shared[256];
    __shared__  int he_shared[256];

    // this is the threshold
    int cutval = (2*width*height)*cutoff;
    prefix_sum[k] = cont[k];

    int l;
    // a variant of scan algorithm 
    for(l=0;l<=limit;l++)
    {
        sum_s[k]=prefix_sum[k];

        if(k >= (int)pow((float)2,(float)l))
        {  
            prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
            // Find out the minimum index for which the cummulative sum crosses threshold
            if(prefix_sum[k] > cutval)
            {
                atomicMin(&vars[cut],k);
            }
        }
        __syncthreads();
    }

    // The first thread will store the value in global array
    if(k==0)
    {
        vars[cuts]=prefix_sum[vars[cut]];
    }
    __syncthreads();


    if(vars[n])
    {
        // bs = 7 in this case
        if(i<bs && j<bs)
        {
            // using atomic add because the index could be same for 2 different threads
            atomicAdd(&ary_shared[in[i*(width) + j]],1);  
        }
        __syncthreads();


        int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[k];
        sum_s[k] = 0;

        // Again prefix sum

        int l;
        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > minth)
                {
                    atomicMin(&vars[hmin],k);
                }
            }
            __syncthreads();
        }

         // set the maximum value here
        if(k==0)
        {
            vars[hminc]=prefix_sum[255];
            // because we will always overshoot by 1
            vars[hmin]--;
        }

        __syncthreads();

        int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[255-k];

        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > maxth)
                {
                    atomicMin(&vars[hmax], k);
                }
            }
             __syncthreads();
        }
         // set the maximum value here

         if(k==0)
         {
            vars[hmaxc]=prefix_sum[255];
            vars[hmax]--;
            vars[hmax]=255-vars[hmax];

         }
        __syncthreads();



        int rng = vars[hmax] - vars[hmin];
        if(rng >= vars[cut])
        {
          if( k <= vars[hmin] )
                he_shared[k] = 0;
          else if( k >= vars[hmax])
                he_shared[k] = 255;
          else
                he_shared[k] = (255 * (k - vars[hmin])) / rng;
        }
         __syncthreads();

        // only 7x7 = 49 threads will do this
        if(i>0 && i<=bs && j>0 && j<=bs)
        {
           int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);

           if(rng >= vars[cut])
           {
              int value = he_shared[in[base]];
              buf[base]+=value;
              lim[base]++;
           }
           else
           {
              buf[base]+=255;
              lim[base]++;
           }
        }

        if(k==0)
            vars[n]--;

        __syncthreads();   


    }// if(n) block closes here

    while(vars[n])
    {


        if(k==0)
        {
            if( vars[ox]==0 && vars[d1] ==3 )
                vars[d1] = 0; // l2r
            else if( vars[ox]==0 && vars[d1]==2 )
                vars[d1] = 3; // l u2d
            else if( vars[ox]==width-bs && vars[d1]==0)
                vars[d1] = 1; // r u2d
            else if( vars[ox]==width-bs && vars[d1]==1)
                vars[d1] = 2; // r2l

        }

        // Because this value will be changed so
        // all the threads should set their registers before
        // they move forward
        int ox_d = vars[ox];
        int oy_d = vars[oy];

        // Just putting it here so that all the threads should have set their
        // values before moving on, as this value will be changed
        __syncthreads();

        if(vars[d1]==0)
        {

            if(i == 0 && j < bs)
            {
                int index = j*width + ox_d + oy_d*width;
                int index2 = j*width + ox_d + oy_d*width +bs;

                atomicSub(&ary_shared[in[index]],1);
                atomicAdd(&ary_shared[in[index2]],1);
            }

            // The first thread of the first block should set this value
            if(k==0)
                vars[ox]++;
        }
        else if(vars[d1]==1||vars[d1]==3)
        {

            if(i == 0 && j < bs)
            {
                /*if(j==0)
                printf("Entered 1||3\n");*/
                int index = j*width + ox_d + oy_d*width;
                int index2 = j*width + ox_d + (oy_d+bs)*width;

                atomicSub(&ary_shared[in[index]],1);
                atomicAdd(&ary_shared[in[index2]],1);

            }
            // The first thread of the first block should set this value
            if(k==0)
                vars[oy]++;

        }
        else if(vars[d1]==2)
        {

            if(i == 0 && j < bs)
            {
                int index = j*width + ox_d-1 + oy_d*width;
                int index2 = j*width + ox_d-1 + oy_d*width +bs;

                atomicAdd(&ary_shared[in[index]],1);
                atomicSub(&ary_shared[in[index2]],1);

            }
            // The first thread of the first block should set this value
            if(k==0 )
                vars[ox]--;
         }
        __syncthreads();

        //ary_shared has been calculated

        // Reset the hmin and hminc values
        // again the same task as done in the if(n) loop
        if(k==0)
        {
            vars[hmin]=0;
            vars[hminc]=0;
            vars[hmax]=0;
            vars[hmaxc]=0;
        }
        __syncthreads();

        int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[k];

        int l;
        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > minth)
                {
                    atomicMin(&vars[hmin],k);
                }
            }
            __syncthreads();
        }

         // set the maximum value here
        if(k==0)
        {
            vars[hminc]=prefix_sum[255];
            vars[hmin]--;
        }
        __syncthreads();

        // Calculate maxth
        int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
        prefix_sum[k] = ary_shared[255-k];

        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];

            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cummulative sum crosses threshold
                if(prefix_sum[k] > maxth)
                {
                    atomicMin(&vars[hmax], k);
                }
            }
             __syncthreads();
        }
         // set the maximum value here

         if(k==0)
         {
            vars[hmaxc]=prefix_sum[255];
            vars[hmax]--;
            vars[hmax]=255-vars[hmax];
         }
        __syncthreads();

        int rng = vars[hmax] - vars[hmin];
        if(rng >= vars[cut])
        {
          if( k <= vars[hmin] )
                he_shared[k] = 0;
          else if( k >= vars[hmax])
                he_shared[k] = 255;
          else
                he_shared[k] = (255 * (k - vars[hmin])) / rng;
        }
         __syncthreads();


        if(i>0 && i<=bs && j>0 && j<=bs)
        {
           int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);

           if(rng >= vars[cut])
           {
              int value = he_shared[in[base]];
              buf[base]+=value;
              lim[base]++;
           }
           else
           {

              buf[base]+=255;
              lim[base]++;

           }
        }

        // This just might cause a little bit of problem
        if(k==0)
            vars[n]--;

        // All threads will wait here before continuing the while loop
        __syncthreads();

    }// end of while(n)
}

分享到QQ

分享到微博