CudaMalloc 如何工作？

发布于 2024-09-03 15:42:26 字数 2078 浏览 3 评论 0原文

我正在尝试修改 CUDA SDK 中的 imageDenosing 类，我需要多次重复过滤器，以防捕获时间。但我的代码无法正常工作。

//start

__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{  

const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if(iy != 0 && iy < imageH-1  && ix < imageW)
    {

        float4 fresult = get_color(image[imageW * iy + ix]);
        float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
        float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);

        float4 fresult7; 
            fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
            fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
            fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;

        buffer[imageW * iy + ix] =      
            make_color(fresult7.x,fresult7.y,fresult7.z,0);     

    }

    image[imageW * iy + ix] =   buffer[imageW * iy + ix];
    //should be use cudaMemcpy, But it fails
}

//extern

extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);

size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));

*for(int i = 0;i<1;i++)
{   
    cudaMalloc( (void **)&dst2, size);
            cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
    F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
    cudaFree(dst2);
}*

}

此代码可以工作，但无法同步图像数组。并导致许多同步问题

原文

I am trying to modify the imageDenosing class in CUDA SDK, I need to repeat the filter many time incase to capture the time. But my code doesn't work properly.

//start

__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{  

const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if(iy != 0 && iy < imageH-1  && ix < imageW)
    {

        float4 fresult = get_color(image[imageW * iy + ix]);
        float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
        float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);

        float4 fresult7; 
            fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
            fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
            fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;

        buffer[imageW * iy + ix] =      
            make_color(fresult7.x,fresult7.y,fresult7.z,0);     

    }

    image[imageW * iy + ix] =   buffer[imageW * iy + ix];
    //should be use cudaMemcpy, But it fails
}

//extern

extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);

size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));

*for(int i = 0;i<1;i++)
{   
    cudaMalloc( (void **)&dst2, size);
            cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
    F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
    cudaFree(dst2);
}*

}

This code works, but cant synchronise the array of image. and lead to many synchronise problem

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

夜深人未静 2024-09-10 15:42:26

您的内核正在异步运行 - 您需要等待它完成，例如

cudaMalloc((void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW * imageH * sizeof(TColor), cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH, dst2);
cudaThreadSynchronize(); // *** wait for kernel to complete ***
cudaFree(dst2);

Your kernel is running asynchronously - you need to wait for it to complete, e.g.

cudaMalloc((void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW * imageH * sizeof(TColor), cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH, dst2);
cudaThreadSynchronize(); // *** wait for kernel to complete ***
cudaFree(dst2);

回复收藏 0 原文

于我来说 2024-09-10 15:42:26

当您之前发布相同的问题时，我已经为您回答了这个问题 - 您需要等待内核完成才能再次运行它 -

cudaThreadSynchronize(); // *** wait for kernel to complete ***

在内核调用之后添加：。

I already answered this for you when you posted the same question previously - you need to wait for a kernel to complete before running it again - add:

cudaThreadSynchronize(); // *** wait for kernel to complete ***

after the kernel call.

回复收藏 0 原文

ゝ偶尔ゞ 2024-09-10 15:42:26

该声明

image[imageW * iy + ix] =   buffer[imageW * iy + ix];

导致了问题。您正在覆盖内核中的输入图像。因此，根据线程执行顺序，您将进一步模糊图像的某些部分。

另外，我不认为

cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);

dst 的目的看起来是设备内存，因为您可以在 cuda 内核中访问它。

The statement

image[imageW * iy + ix] =   buffer[imageW * iy + ix];

is causing the problem. You are overwriting your input image in the kernel. So depending on thread execution order, you would be further blurring parts of the image.

Also, I don't see the purpose of

cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);

dst looks to be device memory since you have access to it in the cuda kernal.

回复收藏 0 原文

~没有更多了~

关于作者

昇り龍

暂无简介

0 文章

0 评论

22 人气

关注发私信

謌踐踏愛綪

文章 0 评论 0

关注

开始看清了

文章 0 评论 0

关注

高速公鹿

文章 0 评论 0

关注

alipaysp_PLnULTzf66

文章 0 评论 0

关注

热情消退

文章 0 评论 0

关注

白色月光

文章 0 评论 0

友情链接

文江博客

CudaMalloc 如何工作？

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（3）

关于作者

相关话题

热门标签

推荐作者

謌踐踏愛綪

开始看清了

高速公鹿

alipaysp_PLnULTzf66

热情消退

白色月光

友情链接

CudaMalloc 如何工作？

如果你对这篇内容有疑问，欢迎到本站社区发帖提问 参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（3）

关于作者

相关话题

热门标签

推荐作者

謌踐踏愛綪

开始看清了

高速公鹿

alipaysp_PLnULTzf66

热情消退

白色月光

友情链接

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。