CudaMalloc 如何工作?
我正在尝试修改 CUDA SDK 中的 imageDenosing 类,我需要多次重复过滤器,以防捕获时间。但我的代码无法正常工作。
//start
__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if(iy != 0 && iy < imageH-1 && ix < imageW)
{
float4 fresult = get_color(image[imageW * iy + ix]);
float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);
float4 fresult7;
fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;
buffer[imageW * iy + ix] =
make_color(fresult7.x,fresult7.y,fresult7.z,0);
}
image[imageW * iy + ix] = buffer[imageW * iy + ix];
//should be use cudaMemcpy, But it fails
}
//extern
extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);
size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));
*for(int i = 0;i<1;i++)
{
cudaMalloc( (void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
cudaFree(dst2);
}*
}
此代码可以工作,但无法同步图像数组。并导致许多同步问题
I am trying to modify the imageDenosing class in CUDA SDK, I need to repeat the filter many time incase to capture the time. But my code doesn't work properly.
//start
__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if(iy != 0 && iy < imageH-1 && ix < imageW)
{
float4 fresult = get_color(image[imageW * iy + ix]);
float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);
float4 fresult7;
fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;
buffer[imageW * iy + ix] =
make_color(fresult7.x,fresult7.y,fresult7.z,0);
}
image[imageW * iy + ix] = buffer[imageW * iy + ix];
//should be use cudaMemcpy, But it fails
}
//extern
extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);
size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));
*for(int i = 0;i<1;i++)
{
cudaMalloc( (void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
cudaFree(dst2);
}*
}
This code works, but cant synchronise the array of image. and lead to many synchronise problem
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(3)
您的内核正在异步运行 - 您需要等待它完成,例如
Your kernel is running asynchronously - you need to wait for it to complete, e.g.
当您之前发布相同的问题时,我已经为您回答了这个问题 - 您需要等待内核完成才能再次运行它 -
在内核调用之后添加:。
I already answered this for you when you posted the same question previously - you need to wait for a kernel to complete before running it again - add:
after the kernel call.
该声明
导致了问题。您正在覆盖内核中的输入图像。因此,根据线程执行顺序,您将进一步模糊图像的某些部分。
另外,我不认为
dst 的目的看起来是设备内存,因为您可以在 cuda 内核中访问它。
The statement
is causing the problem. You are overwriting your input image in the kernel. So depending on thread execution order, you would be further blurring parts of the image.
Also, I don't see the purpose of
dst
looks to be device memory since you have access to it in the cuda kernal.