CUDA "unspecified launch failure" when accessing memory

Published on 2024-12-04 19:29:59

What I'm trying to do is very simple. Each thread reads a sub-array from a global array stored in global memory, does some calculations, and stores the result in a static (per-thread) array. Finally, the output is written back to another array in global memory. When I comment out the line that writes the static array back to the global array, the kernel runs, as marked in the code below. Any ideas?

GPU kernel:

#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_

#include <stdio.h>

__device__  void
DecompressBlockGPU(unsigned char *compressed_block,unsigned char *compressed_size,
                    int array_length,unsigned char *decompressed_block)
{       
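    // Run-length expansion: value compressed_block[i] is repeated
    // compressed_size[i] times, for i in [0, array_length).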
    int j = 0;

    for(int i = 0 ; i < array_length ;i++)
    {
        for(int idx = 0 ; idx < compressed_size[i]; idx++)
        {
            decompressed_block[j] = compressed_block[i];
            j++;
        }
    }
}
__global__ void
gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
        unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
        int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
    int x_max = xBlocks ;
    int y_max = yBlocks ;

    int x_block = blockIdx.x ; 
    int y_block = blockIdx.y ;

    x_max = gridDim.x*blockDim.x ;
    y_max = gridDim.y*blockDim.y ;

    x_block = (blockIdx.x*xTH); 
    y_block = (blockIdx.y*yTH);
    int x_block1 = x_block + threadIdx.x;
    int y_block1 = y_block + threadIdx.y;

    int block_idx = y_block1*xBlocks + x_block1;
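    // Each block's record at compressed_data + OffsetsArray[block_idx] appears
    // to be laid out as: [int: run count N][N value bytes][N run-length bytes].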
    unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];        
    int *array_length = (int *)temp_ptr;
    unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] + 
                               array_length[0] +sizeof(int)/sizeof(unsigned char);
    unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] + 
                               sizeof(int)/sizeof(unsigned char);

    aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
    aux_array[block_idx]=array_length[0];

    unsigned char decompressed_block[72];
    unsigned char extracted_block[32];

    DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
                             &decompressed_block[0]);

    if(block_idx == 0)
    {
        for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails  
        for(int i=16;i<16*36;i++) aux_array[i]=1;//works
    }
}
#endif

CPU function:

unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{


    printf("xBlocks =%d yBlocks =%d  \n",xBlocks,yBlocks);



    int xTB = 4;
    int yTB = 4;
    int xTH = 1;
    int yTH = 1; 



    unsigned char *d_output;
    unsigned char *d_aux_array;
    unsigned char *h_aux_array;
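    // (image_len, BlockWidth, BlockHeight, BlockSize and cols used below are
    // file-scope globals defined elsewhere in the original project.)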

    int mem_size = image_len*sizeof(unsigned char);
    int big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);

    cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
    cutilSafeCall( cudaMalloc( (void**) &d_aux_array,big_mem_size));
    h_aux_array = (unsigned char *)malloc(big_mem_size);


    float time = 0;
    float totalTime = 0;
    cudaEvent_t start_event4, stop_event4;
    cutilSafeCall( cudaEventCreate(&start_event4) );
    cutilSafeCall( cudaEventCreate(&stop_event4) );
    cutilSafeCall( cudaEventRecord(start_event4, 0) );

    dim3 grid(xTB,yTB, 1);
    dim3 threads( xTH, yTH, 1);

    gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
    cudaThreadSynchronize();

    cutilSafeCall( cudaEventRecord(stop_event4, 0) );
    cutilSafeCall( cudaEventSynchronize(stop_event4) );
    time = 0;
    cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
    totalTime += time;
    totalTime /= (1.0e3 * 1);
    shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime); 

    cutilSafeCall(cudaMemcpy(h_output,d_output, mem_size, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(h_aux_array,d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));


    cudaFree(d_output);
    cudaFree(d_aux_array);

    return h_aux_array;

}

Is it clear now? (after editing)

Comments (1)

脸赞 2024-12-11 19:29:59

Try running your program through cuda-memcheck (or enable memory checking if you are using Parallel Nsight).
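For illustration, here is a minimal sketch of both approaches; the executable name ./decompress is a placeholder, and the error check is not part of the original code, it only shows where an "unspecified launch failure" would be reported, since cutilSafeCall does not wrap the kernel launch itself. From the command line:

    cuda-memcheck ./decompress

And programmatically, right after the launch in runGPU():

    gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);

    cudaError_t launch_err = cudaGetLastError();      // error from the launch itself
    cudaError_t sync_err   = cudaThreadSynchronize(); // error raised while the kernel ran
    if (launch_err != cudaSuccess || sync_err != cudaSuccess)
        printf("kernel failed: %s / %s\n",
               cudaGetErrorString(launch_err), cudaGetErrorString(sync_err));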
