当前位置：文江博客话题详情

cuda c 程序中得到错误的值

发布于 2024-12-09 15:18:38 字数 3459 浏览 0 评论 0原文

我正在尝试在 cuda C 中模拟矩阵乘法。除了输出之外，一切都正确。

这是我的程序：

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define     N       4
#define TILE_WIDTH 2

__global__ void MatMul(int*A, int* B, int* C) {  

    int sum; 
    int idx = threadIdx.x; 
    int idy = threadIdx.y; 
    int bx = blockIdx.x; 
    int by = blockIdx.y; 
    int k ,uidx , uidy , i; 
    uidx = bx*TILE_WIDTH + idx;
    uidy = by*TILE_WIDTH + idy; 
    sum = 0;


    // Allocating memory in shared memory

    __shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
    __shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

    //copying the data to shared memory 

    for( i =0;i<N/TILE_WIDTH; i++) 
    { 
        temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N]; 
        temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx]; 
        __syncthreads();

        // multiplying matrices in shared memory 

        for(k=0 ; k < TILE_WIDTH;k++) {
            sum = sum + temp1[idy][k]*temp2[k][idx];
        }
    }

    // synchronizing the threads 

    __syncthreads(); 
    C[uidy*N + uidx] = sum;
}

int main( void ) {

    int a[N][N], b[N][N], c[N][N];     //host copies of a,b,c

    int *dev_a, *dev_b, *dev_c;        //device copies of a,b,c

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );

    // fill the matrices 'a' and 'b' on the CPU

    for (int i=0; i<N; i++) {
        for (int j=0; j < N; j++) {
            a[i][j] = j+3;
            b[i][j] = i+6;
        }
    }
    //copy above a,b values to device

    cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
    // Prepare timer
    cudaEvent_t start, stop;
    float time;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //start record
    cudaEventRecord(start, 0);

    // Kernel invocation with N threads 
    dim3 dimGrid(2,2,1); 
    dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
    MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);

    //stop record
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    //this is operation time
    cudaEventElapsedTime(&time, start, stop);

    //clean up      
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    //copy result to host
    cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );

    //output..
    for (int i=0; i < N; i++){
        for (int j=0; j < N; j++)
            printf( "%d ", a[i][j]);
        printf ("  ");
        for (int j=0; j < N; j++)
            printf( "%d ", b[i][j]);
        printf ("  =  ");
        for (int j=0; j < N; j++)
            printf( "%d ", c[i][j]);
        printf ("\n");
    }



    //free the allocated memory in device
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n",time);
    getch();
    return 0;
}

这是我的输出：

3 4 5 6     6 6 6 6         108 108 115 115
3 4 5 6     7 7 7 7         108 108 115 115
3 4 5 6     8 8 8 8         108 108 115 115
3 4 5 6     9 9 9 9         108 108 115 115

它显示错误的值。请告诉我程序中的任何错误。我对 CUDA C 很陌生。

I am trying to simulate matrix multiplication in cuda C. Everything is correct except the output.

This is my program:

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define     N       4
#define TILE_WIDTH 2

__global__ void MatMul(int*A, int* B, int* C) {  

    int sum; 
    int idx = threadIdx.x; 
    int idy = threadIdx.y; 
    int bx = blockIdx.x; 
    int by = blockIdx.y; 
    int k ,uidx , uidy , i; 
    uidx = bx*TILE_WIDTH + idx;
    uidy = by*TILE_WIDTH + idy; 
    sum = 0;


    // Allocating memory in shared memory

    __shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
    __shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

    //copying the data to shared memory 

    for( i =0;i<N/TILE_WIDTH; i++) 
    { 
        temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N]; 
        temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx]; 
        __syncthreads();

        // multiplying matrices in shared memory 

        for(k=0 ; k < TILE_WIDTH;k++) {
            sum = sum + temp1[idy][k]*temp2[k][idx];
        }
    }

    // synchronizing the threads 

    __syncthreads(); 
    C[uidy*N + uidx] = sum;
}

int main( void ) {

    int a[N][N], b[N][N], c[N][N];     //host copies of a,b,c

    int *dev_a, *dev_b, *dev_c;        //device copies of a,b,c

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );

    // fill the matrices 'a' and 'b' on the CPU

    for (int i=0; i<N; i++) {
        for (int j=0; j < N; j++) {
            a[i][j] = j+3;
            b[i][j] = i+6;
        }
    }
    //copy above a,b values to device

    cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
    // Prepare timer
    cudaEvent_t start, stop;
    float time;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //start record
    cudaEventRecord(start, 0);

    // Kernel invocation with N threads 
    dim3 dimGrid(2,2,1); 
    dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
    MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);

    //stop record
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    //this is operation time
    cudaEventElapsedTime(&time, start, stop);

    //clean up      
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    //copy result to host
    cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );

    //output..
    for (int i=0; i < N; i++){
        for (int j=0; j < N; j++)
            printf( "%d ", a[i][j]);
        printf ("  ");
        for (int j=0; j < N; j++)
            printf( "%d ", b[i][j]);
        printf ("  =  ");
        for (int j=0; j < N; j++)
            printf( "%d ", c[i][j]);
        printf ("\n");
    }



    //free the allocated memory in device
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n",time);
    getch();
    return 0;
}

And this is my output:

3 4 5 6     6 6 6 6         108 108 115 115
3 4 5 6     7 7 7 7         108 108 115 115
3 4 5 6     8 8 8 8         108 108 115 115
3 4 5 6     9 9 9 9         108 108 115 115

It is showing wrong values. Please tell me any error in my program. I'm very new to CUDA C.

收藏 0

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

评论（2）

柒夜笙歌凉 2024-12-16 15:18:38

虽然我不知道你的程序出了什么问题，但我认为你应该能够使用更简单的矩阵更好地诊断它。您是否尝试过将两个单位矩阵相乘？或者全为1。对各种简单矩阵的重复测试应该可以证明细胞是如何组合的。

最终，我认为您会发现使用 TILE_WIDTH 的方式存在问题，但我不能确定。

回复收藏 0 原文

彻夜缠绵 2024-12-16 15:18:38

这应该可以修复它（在i循环中）：

temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];

This should fix it (in the i loop):

temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];

回复收藏 0 原文

~没有更多了~

关于作者

暂无简介

0 文章

0 评论

23 人气

关注发私信

相关话题

热门标签

操作系统程序设计 IT运维 Linux系统管理 JavaScript 服务器应用 solaris C/C++ PHP Shell BSD Vue.js aix Oracle Python HTML 系统管理 HTML5 CSS 前端

推荐作者

已经忘了多久

文章 0 评论 0

15867725375

文章 0 评论 0

LonelySnow

文章 0 评论 0

走过海棠暮

文章 0 评论 0

轻许诺言

文章 0 评论 0

信馬由缰

文章 0 评论 0

友情链接

我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的隐私政策了解更多相关信息。单击 接受 或继续使用网站，即表示您同意使用 Cookies 和您的相关数据。

原文