CudaMalloc 如何工作?

发布于 2024-09-04 05:24:35 字数 2078 浏览 1 评论 0原文

我正在尝试修改 CUDA SDK 中的 imageDenosing 类,我需要多次重复过滤器,以防捕获时间。但我的代码无法正常工作。

//start

__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{  

const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if(iy != 0 && iy < imageH-1  && ix < imageW)
    {

        float4 fresult = get_color(image[imageW * iy + ix]);
        float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
        float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);

        float4 fresult7; 
            fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
            fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
            fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;

        buffer[imageW * iy + ix] =      
            make_color(fresult7.x,fresult7.y,fresult7.z,0);     

    }

    image[imageW * iy + ix] =   buffer[imageW * iy + ix];
    //should be use cudaMemcpy, But it fails
}

//extern

extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);

size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));

*for(int i = 0;i<1;i++)
{   
    cudaMalloc( (void **)&dst2, size);
            cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
    F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
    cudaFree(dst2);
}*

}

此代码可以工作,但无法同步图像数组。并导致许多同步问题

I am trying to modify the imageDenosing class in CUDA SDK, I need to repeat the filter many time incase to capture the time. But my code doesn't work properly.

//start

__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{  

const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if(iy != 0 && iy < imageH-1  && ix < imageW)
    {

        float4 fresult = get_color(image[imageW * iy + ix]);
        float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
        float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);

        float4 fresult7; 
            fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
            fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
            fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;

        buffer[imageW * iy + ix] =      
            make_color(fresult7.x,fresult7.y,fresult7.z,0);     

    }

    image[imageW * iy + ix] =   buffer[imageW * iy + ix];
    //should be use cudaMemcpy, But it fails
}

//extern

extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);

size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));

*for(int i = 0;i<1;i++)
{   
    cudaMalloc( (void **)&dst2, size);
            cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
    F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
    cudaFree(dst2);
}*

}

This code works, but cant synchronise the array of image. and lead to many synchronise problem

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

神魇的王 2024-09-11 05:24:35

您的内核正在异步运行 - 您需要等待它完成,例如

cudaMalloc((void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW * imageH * sizeof(TColor), cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH, dst2);
cudaThreadSynchronize(); // *** wait for kernel to complete ***
cudaFree(dst2);

Your kernel is running asynchronously - you need to wait for it to complete, e.g.

cudaMalloc((void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW * imageH * sizeof(TColor), cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH, dst2);
cudaThreadSynchronize(); // *** wait for kernel to complete ***
cudaFree(dst2);
时光与爱终年不遇 2024-09-11 05:24:35

当您之前发布相同的问题时,我已经为您回答了这个问题 - 您需要等待内核完成才能再次运行它 -

cudaThreadSynchronize(); // *** wait for kernel to complete ***

在内核调用之后添加:。

I already answered this for you when you posted the same question previously - you need to wait for a kernel to complete before running it again - add:

cudaThreadSynchronize(); // *** wait for kernel to complete ***

after the kernel call.

云朵有点甜 2024-09-11 05:24:35

该声明

image[imageW * iy + ix] =   buffer[imageW * iy + ix];

导致了问题。您正在覆盖内核中的输入图像。因此,根据线程执行顺序,您将进一步模糊图像的某些部分。

另外,我不认为

cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);

dst 的目的看起来是设备内存,因为您可以在 cuda 内核中访问它。

The statement

image[imageW * iy + ix] =   buffer[imageW * iy + ix];

is causing the problem. You are overwriting your input image in the kernel. So depending on thread execution order, you would be further blurring parts of the image.

Also, I don't see the purpose of

cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);

dst looks to be device memory since you have access to it in the cuda kernal.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文