CUDA“未指定的启动失败”访问内存
我想做的事情很简单。 每个线程从存储在全局内存中的全局数组中读取子数组。 然后它进行一些计算并将结果存储在静态数组中。 最后输出被存储回全局内存中的另一个数组中 当我注释将静态数组写入全局数组的行时,内核将运行。如代码所示。 有什么想法吗?
GPU内核:
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
// Run-length decompression of one block, executed per-thread on the device.
//
// For each of the array_length runs, writes compressed_block[i] repeated
// compressed_size[i] times into decompressed_block, in order.
//
// compressed_block       - one symbol byte per run
// compressed_size        - run length per symbol (0..255 each)
// array_length           - number of runs
// decompressed_block     - output buffer, filled sequentially from index 0
// decompressed_capacity  - size of the output buffer; writing stops once the
//                          buffer is full. Defaults to "effectively unchecked"
//                          so existing call sites keep their old behaviour,
//                          but callers with a fixed-size buffer (e.g. the
//                          72-byte local array in gpu_test) should pass its
//                          size: overflowing a thread-local array is a classic
//                          cause of an "unspecified launch failure".
__device__ void
DecompressBlockGPU(unsigned char *compressed_block, unsigned char *compressed_size,
                   int array_length, unsigned char *decompressed_block,
                   int decompressed_capacity = 0x7FFFFFFF)
{
    int j = 0;
    for (int i = 0; i < array_length; i++)
    {
        for (int idx = 0; idx < compressed_size[i]; idx++)
        {
            // Guard: never write past the caller's buffer.
            if (j >= decompressed_capacity)
                return;
            decompressed_block[j] = compressed_block[i];
            j++;
        }
    }
}
// Kernel: one thread per compressed image block. Each thread locates its
// block's record inside compressed_data via OffsetsArray, RLE-decompresses it
// into a thread-local buffer, and (block_idx == 0 only) dumps part of the
// result into aux_array so the host can inspect it.
// Record layout at compressed_data + OffsetsArray[block_idx] (inferred from
// the pointer arithmetic below — TODO confirm against the compressor):
//   [int run_count][run_count symbol bytes][run_count length bytes ...]
// NOTE(review): output, BlockSize, cols, x_max/y_max, xTB and yTB are never
// used in this kernel body.
__global__ void
gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
// NOTE(review): the next four assignments are dead — every one of these
// variables is overwritten below before being read.
int x_max = xBlocks ;
int y_max = yBlocks ;
int x_block = blockIdx.x ;
int y_block = blockIdx.y ;
x_max = gridDim.x*blockDim.x ;
y_max = gridDim.y*blockDim.y ;
// Per-thread 2D coordinates: block origin scaled by threads-per-block plus
// the thread offset within the block.
x_block = (blockIdx.x*xTH);
y_block = (blockIdx.y*yTH);
int x_block1 = x_block + threadIdx.x;
int y_block1 = y_block + threadIdx.y;
// Flat block index. NOTE(review): the row stride is xBlocks, but the launch
// grid is only xTB blocks wide (4 in runGPU); if xTB != xBlocks, distinct
// threads alias the same block_idx and/or index past the offset table —
// confirm against the launch configuration in runGPU.
int block_idx = y_block1*xBlocks + x_block1;
unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];
// Reinterpret the first bytes of the record as the int run count.
// NOTE(review): this int load requires OffsetsArray[block_idx] to be 4-byte
// aligned; a misaligned device load is undefined — verify how the offsets
// are produced on the host.
int *array_length = (int *)temp_ptr;
// Run-length bytes start after the int header and the symbol bytes.
unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] +
array_length[0] +sizeof(int)/sizeof(unsigned char);
// Symbol bytes start right after the int header.
unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] +
sizeof(int)/sizeof(unsigned char);
// aux_array now points at this block's private slice of
// (BlockWidth+2)*(BlockHeight+2) bytes.
aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
// NOTE(review): indexing the slice with the *global* block_idx walks past the
// slice for large block_idx — presumably aux_array[0] was intended.
aux_array[block_idx]=array_length[0];
// NOTE(review): DecompressBlockGPU writes sum(compressed_size[0..n-1]) bytes,
// but decompressed_block holds only 72; nothing bounds that sum, so a local
// array overflow here is the prime suspect for the reported "unspecified
// launch failure". extracted_block is never used.
unsigned char decompressed_block[72];
unsigned char extracted_block[32];
DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
&decompressed_block[0]);
// Debug output for the first block only.
if(block_idx == 0)
{
for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails
// NOTE(review): this loop writes up to index 16*36-1 = 575; that stays inside
// the overall aux_array allocation only if
// xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2) >= 576 — confirm.
for(int i=16;i<16*36;i++) aux_array[i]=1;//works
}
}
#endif
CPU功能:
// Host wrapper: allocates the device output and auxiliary buffers, launches
// gpu_test with a (xTB x yTB) grid of (xTH x yTH) thread blocks, times the
// kernel with CUDA events, and copies both results back to the host.
//
// Parameters:
//   d_compressed_data - device pointer to the packed compressed blocks
//   OffsetsArray      - per-block byte offsets into d_compressed_data
//                       (assumed device-resident — TODO confirm at call site)
//   xBlocks, yBlocks  - logical block-grid dimensions of the compressed image
//   h_output          - host buffer (image_len bytes) receiving d_output
// Returns a malloc'd host copy of the auxiliary array (caller must free()),
// or NULL if the host allocation fails.
// NOTE(review): image_len, BlockWidth, BlockHeight, BlockSize and cols are
// file/global-scope values defined elsewhere in this project.
unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{
    printf("xBlocks =%d yBlocks =%d \n",xBlocks,yBlocks);
    int xTB = 4;   // grid width, in blocks
    int yTB = 4;   // grid height, in blocks
    int xTH = 1;   // threads per block, x
    int yTH = 1;   // threads per block, y
    unsigned char *d_output;
    unsigned char *d_aux_array;
    unsigned char *h_aux_array;
    // size_t avoids int overflow for large images.
    size_t mem_size = image_len*sizeof(unsigned char);
    size_t big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);
    cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
    cutilSafeCall( cudaMalloc( (void**) &d_aux_array, big_mem_size));
    h_aux_array = (unsigned char *)malloc(big_mem_size);
    if (h_aux_array == NULL)   // malloc was previously unchecked
    {
        fprintf(stderr, "runGPU: host allocation of %lu bytes failed\n",
                (unsigned long)big_mem_size);
        cudaFree(d_output);
        cudaFree(d_aux_array);
        return NULL;
    }
    float time = 0;
    float totalTime = 0;
    cudaEvent_t start_event4, stop_event4;
    cutilSafeCall( cudaEventCreate(&start_event4) );
    cutilSafeCall( cudaEventCreate(&stop_event4) );
    cutilSafeCall( cudaEventRecord(start_event4, 0) );
    dim3 grid(xTB, yTB, 1);
    dim3 threads(xTH, yTH, 1);
    gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
    // A kernel launch does not report errors by itself: check for
    // launch-configuration errors immediately, then surface asynchronous
    // execution errors (e.g. "unspecified launch failure") at the sync —
    // both were previously ignored.
    cutilSafeCall( cudaGetLastError() );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaEventRecord(stop_event4, 0) );
    cutilSafeCall( cudaEventSynchronize(stop_event4) );
    time = 0;
    cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
    totalTime += time;
    totalTime /= (1.0e3 * 1);   // ms -> s
    shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime);
    cutilSafeCall(cudaMemcpy(h_output, d_output, mem_size, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(h_aux_array, d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));
    // Release device buffers and timing events (the events were leaked before).
    cutilSafeCall( cudaEventDestroy(start_event4) );
    cutilSafeCall( cudaEventDestroy(stop_event4) );
    cudaFree(d_output);
    cudaFree(d_aux_array);
    return h_aux_array;
}
现在清楚了吗?(编辑后)
What I'm trying to do is very simple.
Each thread reads a sub-array from a global array stored in global memory.
It then does some calculations and stores the result in a static (thread-local) array.
Finally, the output is stored back into another array in global memory.
When I comment out the line that writes the static array to the global array, the kernel runs, as shown in the code.
Any ideas?
GPU kernel :
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
// Run-length decompression of one block, executed per-thread on the device.
//
// For each of the array_length runs, writes compressed_block[i] repeated
// compressed_size[i] times into decompressed_block, in order.
//
// compressed_block       - one symbol byte per run
// compressed_size        - run length per symbol (0..255 each)
// array_length           - number of runs
// decompressed_block     - output buffer, filled sequentially from index 0
// decompressed_capacity  - size of the output buffer; writing stops once the
//                          buffer is full. Defaults to "effectively unchecked"
//                          so existing call sites keep their old behaviour,
//                          but callers with a fixed-size buffer (e.g. the
//                          72-byte local array in gpu_test) should pass its
//                          size: overflowing a thread-local array is a classic
//                          cause of an "unspecified launch failure".
__device__ void
DecompressBlockGPU(unsigned char *compressed_block, unsigned char *compressed_size,
                   int array_length, unsigned char *decompressed_block,
                   int decompressed_capacity = 0x7FFFFFFF)
{
    int j = 0;
    for (int i = 0; i < array_length; i++)
    {
        for (int idx = 0; idx < compressed_size[i]; idx++)
        {
            // Guard: never write past the caller's buffer.
            if (j >= decompressed_capacity)
                return;
            decompressed_block[j] = compressed_block[i];
            j++;
        }
    }
}
// Kernel: one thread per compressed image block. Each thread locates its
// block's record inside compressed_data via OffsetsArray, RLE-decompresses it
// into a thread-local buffer, and (block_idx == 0 only) dumps part of the
// result into aux_array so the host can inspect it.
// Record layout at compressed_data + OffsetsArray[block_idx] (inferred from
// the pointer arithmetic below — TODO confirm against the compressor):
//   [int run_count][run_count symbol bytes][run_count length bytes ...]
// NOTE(review): output, BlockSize, cols, x_max/y_max, xTB and yTB are never
// used in this kernel body.
__global__ void
gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
// NOTE(review): the next four assignments are dead — every one of these
// variables is overwritten below before being read.
int x_max = xBlocks ;
int y_max = yBlocks ;
int x_block = blockIdx.x ;
int y_block = blockIdx.y ;
x_max = gridDim.x*blockDim.x ;
y_max = gridDim.y*blockDim.y ;
// Per-thread 2D coordinates: block origin scaled by threads-per-block plus
// the thread offset within the block.
x_block = (blockIdx.x*xTH);
y_block = (blockIdx.y*yTH);
int x_block1 = x_block + threadIdx.x;
int y_block1 = y_block + threadIdx.y;
// Flat block index. NOTE(review): the row stride is xBlocks, but the launch
// grid is only xTB blocks wide (4 in runGPU); if xTB != xBlocks, distinct
// threads alias the same block_idx and/or index past the offset table —
// confirm against the launch configuration in runGPU.
int block_idx = y_block1*xBlocks + x_block1;
unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];
// Reinterpret the first bytes of the record as the int run count.
// NOTE(review): this int load requires OffsetsArray[block_idx] to be 4-byte
// aligned; a misaligned device load is undefined — verify how the offsets
// are produced on the host.
int *array_length = (int *)temp_ptr;
// Run-length bytes start after the int header and the symbol bytes.
unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] +
array_length[0] +sizeof(int)/sizeof(unsigned char);
// Symbol bytes start right after the int header.
unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] +
sizeof(int)/sizeof(unsigned char);
// aux_array now points at this block's private slice of
// (BlockWidth+2)*(BlockHeight+2) bytes.
aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
// NOTE(review): indexing the slice with the *global* block_idx walks past the
// slice for large block_idx — presumably aux_array[0] was intended.
aux_array[block_idx]=array_length[0];
// NOTE(review): DecompressBlockGPU writes sum(compressed_size[0..n-1]) bytes,
// but decompressed_block holds only 72; nothing bounds that sum, so a local
// array overflow here is the prime suspect for the reported "unspecified
// launch failure". extracted_block is never used.
unsigned char decompressed_block[72];
unsigned char extracted_block[32];
DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
&decompressed_block[0]);
// Debug output for the first block only.
if(block_idx == 0)
{
for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails
// NOTE(review): this loop writes up to index 16*36-1 = 575; that stays inside
// the overall aux_array allocation only if
// xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2) >= 576 — confirm.
for(int i=16;i<16*36;i++) aux_array[i]=1;//works
}
}
#endif
CPU functions :
// Host wrapper: allocates the device output and auxiliary buffers, launches
// gpu_test with a (xTB x yTB) grid of (xTH x yTH) thread blocks, times the
// kernel with CUDA events, and copies both results back to the host.
//
// Parameters:
//   d_compressed_data - device pointer to the packed compressed blocks
//   OffsetsArray      - per-block byte offsets into d_compressed_data
//                       (assumed device-resident — TODO confirm at call site)
//   xBlocks, yBlocks  - logical block-grid dimensions of the compressed image
//   h_output          - host buffer (image_len bytes) receiving d_output
// Returns a malloc'd host copy of the auxiliary array (caller must free()),
// or NULL if the host allocation fails.
// NOTE(review): image_len, BlockWidth, BlockHeight, BlockSize and cols are
// file/global-scope values defined elsewhere in this project.
unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{
    printf("xBlocks =%d yBlocks =%d \n",xBlocks,yBlocks);
    int xTB = 4;   // grid width, in blocks
    int yTB = 4;   // grid height, in blocks
    int xTH = 1;   // threads per block, x
    int yTH = 1;   // threads per block, y
    unsigned char *d_output;
    unsigned char *d_aux_array;
    unsigned char *h_aux_array;
    // size_t avoids int overflow for large images.
    size_t mem_size = image_len*sizeof(unsigned char);
    size_t big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);
    cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
    cutilSafeCall( cudaMalloc( (void**) &d_aux_array, big_mem_size));
    h_aux_array = (unsigned char *)malloc(big_mem_size);
    if (h_aux_array == NULL)   // malloc was previously unchecked
    {
        fprintf(stderr, "runGPU: host allocation of %lu bytes failed\n",
                (unsigned long)big_mem_size);
        cudaFree(d_output);
        cudaFree(d_aux_array);
        return NULL;
    }
    float time = 0;
    float totalTime = 0;
    cudaEvent_t start_event4, stop_event4;
    cutilSafeCall( cudaEventCreate(&start_event4) );
    cutilSafeCall( cudaEventCreate(&stop_event4) );
    cutilSafeCall( cudaEventRecord(start_event4, 0) );
    dim3 grid(xTB, yTB, 1);
    dim3 threads(xTH, yTH, 1);
    gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
    // A kernel launch does not report errors by itself: check for
    // launch-configuration errors immediately, then surface asynchronous
    // execution errors (e.g. "unspecified launch failure") at the sync —
    // both were previously ignored.
    cutilSafeCall( cudaGetLastError() );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaEventRecord(stop_event4, 0) );
    cutilSafeCall( cudaEventSynchronize(stop_event4) );
    time = 0;
    cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
    totalTime += time;
    totalTime /= (1.0e3 * 1);   // ms -> s
    shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime);
    cutilSafeCall(cudaMemcpy(h_output, d_output, mem_size, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(h_aux_array, d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));
    // Release device buffers and timing events (the events were leaked before).
    cutilSafeCall( cudaEventDestroy(start_event4) );
    cutilSafeCall( cudaEventDestroy(stop_event4) );
    cudaFree(d_output);
    cudaFree(d_aux_array);
    return h_aux_array;
}
Is it clear now? (after editing)
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
尝试通过 cuda-memcheck 运行您的程序(或者启用内存检查,如果您正在使用 Parallel Nsight)。
Try running your program through cuda-memcheck (or enable memory checking if you are using Parallel Nsight).