cuda c 程序中得到错误的值
我正在尝试在 cuda C 中模拟矩阵乘法。除了输出之外,一切都正确。
这是我的程序:
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define N 4
#define TILE_WIDTH 2
__global__ void MatMul(int*A, int* B, int* C) {
int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;
// Allocating memory in shared memory
__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];
//copying the data to shared memory
for( i =0;i<N/TILE_WIDTH; i++)
{
temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N];
temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx];
__syncthreads();
// multiplying matrices in shared memory
for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];
}
}
// synchronizing the threads
__syncthreads();
C[uidy*N + uidx] = sum;
}
int main( void ) {
int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c
int *dev_a, *dev_b, *dev_c; //device copies of a,b,c
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );
// fill the matrices 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;
}
}
//copy above a,b values to device
cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//start record
cudaEventRecord(start, 0);
// Kernel invocation with N threads
dim3 dimGrid(2,2,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);
//stop record
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//this is operation time
cudaEventElapsedTime(&time, start, stop);
//clean up
cudaEventDestroy(start);
cudaEventDestroy(stop);
//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );
//output..
for (int i=0; i < N; i++){
for (int j=0; j < N; j++)
printf( "%d ", a[i][j]);
printf (" ");
for (int j=0; j < N; j++)
printf( "%d ", b[i][j]);
printf (" = ");
for (int j=0; j < N; j++)
printf( "%d ", c[i][j]);
printf ("\n");
}
//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf("\n");
printf(" time elapsed in ms=%f\n",time);
getch();
return 0;
}
这是我的输出:
3 4 5 6 6 6 6 6 108 108 115 115
3 4 5 6 7 7 7 7 108 108 115 115
3 4 5 6 8 8 8 8 108 108 115 115
3 4 5 6 9 9 9 9 108 108 115 115
它显示错误的值。请告诉我程序中的任何错误。我对 CUDA C 很陌生。
I am trying to simulate matrix multiplication in cuda C. Everything is correct except the output.
This is my program:
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define N 4
#define TILE_WIDTH 2
__global__ void MatMul(int*A, int* B, int* C) {
int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;
// Allocating memory in shared memory
__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];
//copying the data to shared memory
for( i =0;i<N/TILE_WIDTH; i++)
{
temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N];
temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx];
__syncthreads();
// multiplying matrices in shared memory
for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];
}
}
// synchronizing the threads
__syncthreads();
C[uidy*N + uidx] = sum;
}
int main( void ) {
int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c
int *dev_a, *dev_b, *dev_c; //device copies of a,b,c
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );
// fill the matrices 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;
}
}
//copy above a,b values to device
cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//start record
cudaEventRecord(start, 0);
// Kernel invocation with N threads
dim3 dimGrid(2,2,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);
//stop record
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//this is operation time
cudaEventElapsedTime(&time, start, stop);
//clean up
cudaEventDestroy(start);
cudaEventDestroy(stop);
//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );
//output..
for (int i=0; i < N; i++){
for (int j=0; j < N; j++)
printf( "%d ", a[i][j]);
printf (" ");
for (int j=0; j < N; j++)
printf( "%d ", b[i][j]);
printf (" = ");
for (int j=0; j < N; j++)
printf( "%d ", c[i][j]);
printf ("\n");
}
//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf("\n");
printf(" time elapsed in ms=%f\n",time);
getch();
return 0;
}
And this is my output:
3 4 5 6 6 6 6 6 108 108 115 115
3 4 5 6 7 7 7 7 108 108 115 115
3 4 5 6 8 8 8 8 108 108 115 115
3 4 5 6 9 9 9 9 108 108 115 115
It is showing wrong values. Please tell me any error in my program. I'm very new to CUDA C.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
虽然我不知道你的程序出了什么问题,但我认为你应该能够使用更简单的矩阵更好地诊断它。您是否尝试过将两个单位矩阵相乘?或者全为1。对各种简单矩阵的重复测试应该可以证明细胞是如何组合的。
最终,我认为您会发现使用 TILE_WIDTH 的方式存在问题,但我不能确定。
While I don't know what is wrong with your program, I think you should be able to diagnose it better using simpler matrices. Have you tried multiplying two Identity matrices? Or filled with all 1s. Repeated tests with various simple matrices should demonstrate how the cells are being combined.
Ultimately, I think you'll find a problem with the way you use TILE_WIDTH, but I cannot be sure.
这应该可以修复它(在i循环中):
This should fix it (in the i loop):