CUDA demoting double to float error despite no doubles in the code

Posted 2024-12-18 15:07:58


I'm writing a kernel using PyCUDA. My GPU device only supports compute capability 1.1 (arch sm_11) and so I can only use floats in my code. I've taken great effort to ensure I'm doing everything with floats, but despite that, there is a particular line in my code that keeps causing a compiler error.

The chunk of code is:

  // Gradient magnitude, so 1 <= x <= width, 1 <= y <= height. 
  if( j > 0 && j < im_width && i > 0 && i < im_height){
    gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
  }

Here, idx() is a __device__ helper function that returns a linear index from the pixel indices i and j, and it works only with integers. I use it throughout and it doesn't cause errors anywhere else, so I strongly suspect the problem is not idx(). The sqrt() call is just the standard C math function, which supports floats. All of the arrays involved, x_gradient, y_gradient, and gradient_mag, are float* and are inputs to my function (i.e. declared in Python, then converted to device variables, etc.).
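
For reference, here is a minimal sketch of the kind of host-side setup I mean. The array sizes, the launch configuration, and the use of cuda.In/cuda.Out are illustrative only, not my real driver script; every_pixel_hog_kernel_source is the kernel string shown further down.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# Illustrative sizes; everything must be float32 because sm_11 has no double support.
im_height, im_width = 64, 64
input_image  = np.random.rand(im_height, im_width).astype(np.float32)
gaussian     = np.zeros_like(input_image)
x_gradient   = np.zeros_like(input_image)
y_gradient   = np.zeros_like(input_image)
gradient_mag = np.zeros_like(input_image)
angles       = np.zeros_like(input_image)
output_array = np.zeros((im_height * im_width, 81), dtype=np.float32)

mod  = SourceModule(every_pixel_hog_kernel_source, arch="sm_11")
func = mod.get_function("every_pixel_hog_kernel")

# 16x16 threads per block; gridDim.x * blockDim.x must equal the row stride idx() assumes.
func(cuda.In(input_image), np.int32(im_width), np.int32(im_height),
     cuda.Out(gaussian), cuda.Out(x_gradient), cuda.Out(y_gradient),
     cuda.Out(gradient_mag), cuda.Out(angles), cuda.Out(output_array),
     block=(16, 16, 1), grid=(im_width // 16, im_height // 16))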

I've tried removing the extra cast to float in my code above, with no luck. I've also tried doing something completely stupid like this:

 // Gradient magnitude, so 1 <= x <= width, 1 <= y <= height. 
 if( j > 0 && j < im_width && i > 0 && i < im_height){
    gradient_mag[idx(i,j)] = 3.0f; // also tried float(3.0) here
  }

All of these variations give the same error:

 pycuda.driver.CompileError: nvcc said it demoted types in source code it compiled--this is likely not what you want.
 [command: nvcc --cubin -arch sm_11 -I/usr/local/lib/python2.7/dist-packages/pycuda-2011.1.2-py2.7-linux-x86_64.egg/pycuda/../include/pycuda kernel.cu]
 [stderr:
 ptxas /tmp/tmpxft_00004329_00000000-2_kernel.ptx, line 128; warning : Double is not supported. Demoting to float
 ]

Any ideas? I've debugged many errors in my code and was hoping to get it working tonight, but this has proved to be a bug that I cannot understand.

Added -- Here is a truncated version of the kernel that produces the same error as above on my machine.

 every_pixel_hog_kernel_source = \
 """
 #include <math.h>
 #include <stdio.h>

 __device__ int idx(int ii, int jj){
     return gridDim.x*blockDim.x*ii+jj;
 }

 __device__ int bin_number(float angle_val, int total_angles, int num_bins){ 

     float angle1;   
     float min_dist;
     float this_dist;
     int bin_indx;

     angle1 = 0.0;
     min_dist = abs(angle_val - angle1);
     bin_indx = 0;

     for(int kk=1; kk < num_bins; kk++){
         angle1 = angle1 + float(total_angles)/float(num_bins);
         this_dist = abs(angle_val - angle1);
         if(this_dist < min_dist){
             min_dist = this_dist;
             bin_indx = kk;
         }
     }

     return bin_indx;
 }

 __device__ int hist_number(int ii, int jj){

     int hist_num = 0;

     if(jj >= 0 && jj < 11){ 
         if(ii >= 0 && ii < 11){ 
             hist_num = 0;
         }
         else if(ii >= 11 && ii < 22){
             hist_num = 3;
         }
         else if(ii >= 22 && ii < 33){
             hist_num = 6;
         }
     }
     else if(jj >= 11 && jj < 22){
         if(ii >= 0 && ii < 11){ 
             hist_num = 1;
         }
         else if(ii >= 11 && ii < 22){
             hist_num = 4;
         }
         else if(ii >= 22 && ii < 33){
             hist_num = 7;
         }
     }
     else if(jj >= 22 && jj < 33){
         if(ii >= 0 && ii < 11){ 
             hist_num = 2;
         }
         else if(ii >= 11 && ii < 22){
             hist_num = 5;
         }
         else if(ii >= 22 && ii < 33){
             hist_num = 8;
         }
     }

     return hist_num;
 }

  __global__ void every_pixel_hog_kernel(float* input_image, int im_width, int im_height, float* gaussian_array, float* x_gradient, float* y_gradient, float* gradient_mag, float* angles, float* output_array)
  {    
      /////
      // Setup the thread indices and linear offset.
      /////
      int i = blockDim.y * blockIdx.y + threadIdx.y;
      int j = blockDim.x * blockIdx.x + threadIdx.x;
      int ang_limit = 180;
      int ang_bins = 9;
      float pi_val = 3.141592653589f; //91

      /////
      // Compute a Gaussian smoothing of the current pixel and save it into a new image array
      // Use sync threads to make sure everyone does the Gaussian smoothing before moving on.
      /////
      if( j > 1 && i > 1 && j < im_width-2 && i < im_height-2 ){

            // Hard-coded unit standard deviation 5-by-5 Gaussian smoothing filter.
            gaussian_array[idx(i,j)] = float(1.0/273.0) *(
            input_image[idx(i-2,j-2)] + float(4.0)*input_image[idx(i-2,j-1)] + float(7.0)*input_image[idx(i-2,j)] + float(4.0)*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] + 
            float(4.0)*input_image[idx(i-1,j-2)] + float(16.0)*input_image[idx(i-1,j-1)] + float(26.0)*input_image[idx(i-1,j)] + float(16.0)*input_image[idx(i-1,j+1)] + float(4.0)*input_image[idx(i-1,j+2)] +
            float(7.0)*input_image[idx(i,j-2)] + float(26.0)*input_image[idx(i,j-1)] + float(41.0)*input_image[idx(i,j)] + float(26.0)*input_image[idx(i,j+1)] + float(7.0)*input_image[idx(i,j+2)] +
            float(4.0)*input_image[idx(i+1,j-2)] + float(16.0)*input_image[idx(i+1,j-1)] + float(26.0)*input_image[idx(i+1,j)] + float(16.0)*input_image[idx(i+1,j+1)] + float(4.0)*input_image[idx(i+1,j+2)] +
            input_image[idx(i+2,j-2)] + float(4.0)*input_image[idx(i+2,j-1)] + float(7.0)*input_image[idx(i+2,j)] + float(4.0)*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);
     }
     __syncthreads();

     /////
     // Compute the simple x and y gradients of the image and store these into new images
     // again using syncthreads before moving on.
     /////

     // X-gradient, ensure x is between 1 and width-1
     if( j > 0 && j < im_width){
         x_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i,j-1)]);
     }
     else if(j == 0){
         x_gradient[idx(i,j)] = float(0.0);
     }

    // Y-gradient, ensure y is between 1 and height-1
    if( i > 0 && i < im_height){
         y_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i-1,j)]);
    }
    else if(i == 0){
        y_gradient[idx(i,j)] = float(0.0);
    }
    __syncthreads();

    // Gradient magnitude, so 1 <= x <= width, 1 <= y <= height. 
    if( j < im_width && i < im_height){

        gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
    }
    __syncthreads();

    /////
    // Compute the orientation angles
    /////
    if( j < im_width && i < im_height){
        if(ang_limit == 360){
            angles[idx(i,j)] = float((atan2(y_gradient[idx(i,j)],x_gradient[idx(i,j)])+pi_val)*float(180.0)/pi_val);
        }
        else{
            angles[idx(i,j)] = float((atan( y_gradient[idx(i,j)]/x_gradient[idx(i,j)] )+(pi_val/float(2.0)))*float(180.0)/pi_val);
        }
    }
    __syncthreads();

    // Compute the HoG using the above arrays. Do so in a 3x3 grid, with 9 angle bins for each grid.
    // forming an 81-vector and then write this 81 vector as a row in the large output array.

    int top_bound, bot_bound, left_bound, right_bound, offset;
    int window = 32;

    if(i-window/2 > 0){
        top_bound = i-window/2;
        bot_bound = top_bound + window;
    }
    else{
        top_bound = 0;
        bot_bound = top_bound + window;
    }

    if(j-window/2 > 0){
        left_bound = j-window/2;
        right_bound = left_bound + window;
    }
    else{
        left_bound = 0;
        right_bound = left_bound + window;
    }

    if(bot_bound - im_height > 0){
        offset = bot_bound - im_height;
        top_bound = top_bound - offset;
        bot_bound = bot_bound - offset;
    }

    if(right_bound - im_width > 0){
        offset = right_bound - im_width;
        right_bound = right_bound - offset;
        left_bound = left_bound - offset;
    }

    int counter_i = 0;
    int counter_j = 0;
    int bin_indx, hist_indx, glob_col_indx, glob_row_indx;
    int row_width = 81; 

    for(int pix_i = top_bound; pix_i < bot_bound; pix_i++){
        for(int pix_j = left_bound; pix_j < right_bound; pix_j++){

            bin_indx = bin_number(angles[idx(pix_i,pix_j)], ang_limit, ang_bins);
            hist_indx = hist_number(counter_i,counter_j);

            glob_col_indx = ang_bins*hist_indx + bin_indx;
            glob_row_indx = idx(i,j);

            output_array[glob_row_indx*row_width + glob_col_indx] = float(output_array[glob_row_indx*row_width + glob_col_indx] + float(gradient_mag[idx(pix_i,pix_j)]));


            counter_j = counter_j + 1; 
        }
        counter_i = counter_i + 1;
        counter_j = 0;
    }

}
"""


Comments (3)

凉世弥音 2024-12-25 15:07:58


Here's an unmistakable case of using doubles:

 gaussian_array[idx(i,j)] = float(1.0/273.0) *

See the double literals being divided?

But really, use float literals instead of double literals cast to floats - the casts are ugly, and I suggest they will hide bugs like this.

-------Edit 1/Dec---------

Firstly, thanks @CygnusX1: constant folding would prevent that calculation from surviving into the compiled code - I didn't even think of it.

I've tried to reproduce the environment of the error: I installed CUDA SDK 3.2 (which @EMS mentioned they seem to use in the lab) and compiled the truncated kernel version above. Indeed nvopencc did optimize the calculation above away (thanks @CygnusX1), and it didn't use doubles anywhere in the generated PTX code. Further, ptxas didn't give the error @EMS received. From that, I thought the problem lay outside the every_pixel_hog_kernel_source code itself, perhaps in PyCUDA. However, compiling with PyCUDA 2011.1.2 still does not produce a warning like the one in @EMS's question. I can get the error in the question, but only by introducing a double calculation, such as removing the cast from gaussian_array[idx(i,j)] = float(1.0/273.0) *

To get closer to your Python setup, does the following produce your error:

import pycuda.driver as cuda
from pycuda.compiler import compile

x=compile("""put your truncated kernel code here""",options=[],arch="sm_11",keep=True)

It doesn't produce an error in my environment, so it's possible I simply can't replicate your result.
However, I can give some advice. When using compile (or SourceModule), if you pass keep=True, Python will print the folder where the PTX file is being generated just before showing the error message.
Then examine the PTX file generated in that folder and look for where .f64 appears; that should give some idea of what is being treated as a double. Deciphering which code in your original kernel it corresponds to is difficult, though, so having the simplest example that produces your error will help you.
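
To make the .f64 hunt concrete, here is a minimal, self-contained sketch (a toy kernel, not your code) that should emit a double-precision operation, generates PTX with nvcc, and prints every line containing an .f64 opcode. It assumes nvcc is on your PATH and writes toy.cu / toy.ptx in the current directory:

import subprocess

# A deliberately bad toy kernel: multiplying a float by the double literal 2.5
# promotes the arithmetic to double, which is the pattern ptxas demotes on sm_1x.
toy_kernel = """
__global__ void toy(float* out)
{
    out[threadIdx.x] = out[threadIdx.x] * 2.5;  // 2.5 is a double literal
}
"""

with open("toy.cu", "w") as f:
    f.write(toy_kernel)

# --ptx stops after PTX generation, so we can inspect the intermediate code
# before ptxas gets a chance to demote anything.
subprocess.check_call(["nvcc", "--ptx", "-arch", "sm_11", "toy.cu", "-o", "toy.ptx"])

with open("toy.ptx") as f:
    for lineno, line in enumerate(f, 1):
        if ".f64" in line:
            print("%4d  %s" % (lineno, line.rstrip()))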

感性 2024-12-25 15:07:58


Your problem is here:

angle1 = 0.0;

0.0 is a double precision constant. 0.0f is a single precision constant.
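
For illustration, here is a sketch of bin_number with every literal written as a single-precision constant, kept as a PyCUDA-style source fragment so it could be spliced into every_pixel_hog_kernel_source. I've also swapped abs for the single-precision fabsf as a precaution; that part is my addition, the demotion itself comes from the 0.0 literal.

bin_number_fixed = """
__device__ int bin_number(float angle_val, int total_angles, int num_bins)
{
    float angle1   = 0.0f;                       // was 0.0, a double constant
    float min_dist = fabsf(angle_val - angle1);  // fabsf stays in single precision
    int   bin_indx = 0;

    for (int kk = 1; kk < num_bins; kk++) {
        angle1 += float(total_angles) / float(num_bins);
        float this_dist = fabsf(angle_val - angle1);
        if (this_dist < min_dist) {
            min_dist = this_dist;
            bin_indx = kk;
        }
    }
    return bin_indx;
}
"""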

辞旧 2024-12-25 15:07:58


(This is a comment rather than an answer, but it is too big to post as a comment.)

Could you provide the PTX code around the line where the error occurs?

I tried compiling a simple kernel using the code you provided:

__constant__ int im_width;
__constant__ int im_height;

__device__ int idx(int i,int j) {
    return i+j*im_width;
}

__global__ void kernel(float* gradient_mag, float* x_gradient, float* y_gradient) {
    int i = threadIdx.x;
    int j = threadIdx.y;
  // Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
  if( j > 0 && j < im_width && i > 0 && i < im_height){
    gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
  }
}

using:

nvcc.exe -m32 -maxrregcount=32 -gencode=arch=compute_11,code=\"sm_11,compute_11\" --compile -o "Debug\main.cu.obj" main.cu

and got no errors.

Using the CUDA 4.1 beta compiler


Update

I tried compiling your new code (I am working in CUDA/C++, not PyCUDA, but this shouldn't matter). I didn't catch the error either! I used CUDA 4.1 and CUDA 4.0.
Which version of CUDA do you have installed?

C:\>nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2011 NVIDIA Corporation
Built on Wed_Oct_19_23:13:02_PDT_2011
Cuda compilation tools, release 4.1, V0.2.1221