Cuda-memcheck 未报告越界共享内存访问

发布于 2024-12-22 04:41:32 字数 3045 浏览 2 评论 0原文

我正在使用共享内存运行以下代码：

__global__ void computeAddShared(int *in , int *out, int sizeInput){
        //not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
    extern __shared__ float temp[];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid+=gridDim.x * blockDim.x; // to handle array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset*=2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }
}

int main(){
    
    int size = 16; // size of present input array. Changes after every loop iteration
    int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    /*FILE *f;
    f = fopen("invertedList.txt" , "w");
        a[0] = 1 + (rand() % 8);
        fprintf(f, "%d,",a[0]);
        for( int i = 1 ; i< N; i++){
            a[i] = a[i-1] + (rand() % 8) + 1;
            fprintf(f, "%d,",a[i]);
        }
        fclose(f);*/
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, size* sizeof(int));
    cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 4;
    int blocks = 10; //to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is 2^sth
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size);
        cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));

    
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
    //Cant free either earlier as both point to same location
    cudaError_t chk = cudaFree(godata);
    if(chk!=0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }
    
    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();
    
    return 0;
}

显然，computeAddShared 中的第一个 while 循环导致了越界错误，因为我向共享内存分配了 4 个字节。为什么 cuda-memcheck 没有捕获到这一点。以下是 cuda-memcheck 的输出

========= CUDA-MEMCHECK
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13
6

========= ERROR SUMMARY: 0 errors

原文

I am running the following code using shared memory:

__global__ void computeAddShared(int *in , int *out, int sizeInput){
        //not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
    extern __shared__ float temp[];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid+=gridDim.x * blockDim.x; // to handle array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset*=2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }
}

int main(){
    
    int size = 16; // size of present input array. Changes after every loop iteration
    int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    /*FILE *f;
    f = fopen("invertedList.txt" , "w");
        a[0] = 1 + (rand() % 8);
        fprintf(f, "%d,",a[0]);
        for( int i = 1 ; i< N; i++){
            a[i] = a[i-1] + (rand() % 8) + 1;
            fprintf(f, "%d,",a[i]);
        }
        fclose(f);*/
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, size* sizeof(int));
    cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 4;
    int blocks = 10; //to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is 2^sth
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size);
        cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));

    
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
    //Cant free either earlier as both point to same location
    cudaError_t chk = cudaFree(godata);
    if(chk!=0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }
    
    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();
    
    return 0;
}

Clearly, the first while loop in computeAddShared is causing out of bounds error because I am allocating 4 bytes to shared memory. Why does cuda-memcheck not catch this. Below is the output of cuda-memcheck

========= CUDA-MEMCHECK
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13
6

========= ERROR SUMMARY: 0 errors

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

站稳脚跟 2024-12-29 04:41:32

共享内存分配粒度。硬件无疑具有用于分配的页面大小（可能与 L1 缓存线侧相同）。由于每个块只有 4 个线程，单个页面中“意外地”会有足够的共享内存来让您的代码工作。如果您使用了合理数量的线程块（即扭曲大小的整数倍），则会检测到错误，因为没有足够的分配内存。

回复收藏 0 原文

~没有更多了~

关于作者

傲性难收

暂无简介

文章

26 人气

关注发私信

友情链接

文江博客

Cuda-memcheck 未报告越界共享内存访问

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（1）

关于作者

相关话题

热门标签

推荐作者

Promise

qq_lbRlsh

待＂谢繁草

yy2010hell

漫无边际

傲娇萝莉攻

友情链接

Cuda-memcheck 未报告越界共享内存访问

如果你对这篇内容有疑问，欢迎到本站社区发帖提问 参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（1）

关于作者

相关话题

热门标签

推荐作者

Promise

qq_lbRlsh

待＂谢繁草

yy2010hell

漫无边际

傲娇萝莉攻

友情链接

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。