CUDA 上多项式拟合问题的错误
我尝试使用 CUDA 在设备上做一些简单的循环,但似乎很难理解 Cuda。当我将 CUDA 内核函数与普通 C 代码一起使用时,我从每个函数调用中得到 0。 原始代码:
double evaluate(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int i, j;
int const M=60;
double px, x=-1, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
for (i=0;i<=M;i++)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1) result+=(1-px)*(1-px);
x+=dx;
}
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
我想在 CUDA 上做第一个 for 循环:
double evaluate_gpu(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int j;
int const M=60;
double px, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
int N = M;
double *device_tmp = NULL;
size_t size_tmp = sizeof tmp;
cudaMalloc((double **) &device_tmp, size_tmp);
cudaMemcpy(device_tmp, tmp, size_tmp, cudaMemcpyHostToDevice);
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cEvaluate <<< n_blocks, block_size >>> (device_tmp, result, D);
// cudaMemcpy(result, result, size_result, cudaMemcpyDeviceToHost);
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
设备函数如下所示:
__global__ void cEvaluate_temp(double* tmp,double result, int D)
{
int M =60;
double px;
double x=-1;
double dx=(double)M ;
int j;
dx = 2/dx;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1)
{ __syncthreads();
result+=(1-px)*(1-px); //+=
}
x+=dx;
}
}
我知道我没有指定问题,但似乎我有不止一个。
我不知道何时将变量复制到设备,以及何时“自动”复制。 现在,我正在使用 CUDA 3.2 并且仿真存在问题(我想使用 printf), 当我使用 make emu=1 运行 NVCC 时,使用 printf 时没有错误,但我也没有得到任何输出。
有一个最简单版本的设备功能,我测试过。谁能解释并行增加结果值后会发生什么?我想我应该使用设备共享内存和同步来执行“+=”之类的操作。
__global__ void cEvaluate(double* tmp,double result, int D)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
result+=1;
printf("res = %f ",result); //-deviceemu, make emu=1
}
}
I tried to use CUDA to do some simple loops on device, but it seem that it is hard to understand Cuda. I am getting 0 from every function call, when I use CUDA kernel function with normal C code.
The original code:
double evaluate(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int i, j;
int const M=60;
double px, x=-1, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
for (i=0;i<=M;i++)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1) result+=(1-px)*(1-px);
x+=dx;
}
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
I wanted to do first for loop on CUDA:
double evaluate_gpu(int D, double tmp[], long *nfeval)
{
/* polynomial fitting problem */
int j;
int const M=60;
double px, dx=(double)M, result=0;
(*nfeval)++;
dx = 2/dx;
int N = M;
double *device_tmp = NULL;
size_t size_tmp = sizeof tmp;
cudaMalloc((double **) &device_tmp, size_tmp);
cudaMemcpy(device_tmp, tmp, size_tmp, cudaMemcpyHostToDevice);
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cEvaluate <<< n_blocks, block_size >>> (device_tmp, result, D);
// cudaMemcpy(result, result, size_result, cudaMemcpyDeviceToHost);
px = tmp[0];
for (j=1;j<D;j++) px=1.2*px+tmp[j];
px = px-72.661;
if (px<0) result+=px*px;
px = tmp[0];
for (j=1;j<D;j++) px=-1.2*px+tmp[j];
px =px-72.661;
if (px<0) result+=px*px;
return result;
}
Where the device function looks like:
__global__ void cEvaluate_temp(double* tmp,double result, int D)
{
int M =60;
double px;
double x=-1;
double dx=(double)M ;
int j;
dx = 2/dx;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
px = tmp[0];
for (j=1;j<D;j++)
{
px = x*px + tmp[j];
}
if (px<-1 || px>1)
{ __syncthreads();
result+=(1-px)*(1-px); //+=
}
x+=dx;
}
}
I know that I have not specified the problem, but it seem that I have much more than one.
I do not know when to copy variable to device, and when it will be copied 'automatically'.
Now, I am using CUDA 3.2 and there is problem with emulation (I would like to use printf),
when I run NVCC with make emu=1 , there is no error when I use printf, but I also do not get any output.
There is the simplest version of device function, I tested. Can anybody explain what will happen with result value after incrementing it in parallel ? I think I should use device shared memory and synchronization to do sth like "+=" .
__global__ void cEvaluate(double* tmp,double result, int D)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < 60) //<==>if (idx < M)
{
result+=1;
printf("res = %f ",result); //-deviceemu, make emu=1
}
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
不,变量 result 不在多个线程之间共享。
我的建议是在共享内存中建立一个结果值矩阵,每个线程一个结果,计算每个值并将其减少为单个值。
内核调用后还需要 cudaMemcpy。内核是异步的,需要同步功能。
还可以在每次 CUDA API 调用时使用错误检查函数。
No, the variable result is not shared across multiple threads.
What I would suggest is to have a matrix of result values in shared memory, one result for each thread, compute every value and the reduce it to a single value.
Also you need the cudaMemcpy after the kernel invocation. Kernel are asynchronous and needs a sync function.
Also use the error check functions at each CUDA API invocation.