将数据从设备复制到主机不起作用

发布于 2024-12-07 15:51:05 字数 2299 浏览 0 评论 0原文

我在 Windows 7 x64 上使用 vs2010,并在我的大学项目中使用 CUDA 工具包 v4.0。我想实现一个简单的 gpu-vs-cpu 测试,大部分已经完成,但我的 cuda 测试都没有返回任何结果。我已经使用调试器检查了内存,设备内存包含我需要的所有内容,只有内存复制失败。

// Adds two integer vectors element-wise on the GPU and returns the result.
//
// h_a, h_b : host-side input vectors; h_b must hold at least h_a.size() elements.
// Returns  : a host_vector with h_a.size() elements. If any CUDA call fails, a
//            message is written to stderr and the vector is returned as it was
//            constructed (i.e. without the GPU results).
//
// NOTE(review): addKernel is defined outside this snippet; it is presumably the
// one-thread-per-element kernel (c[i] = a[i] + b[i], i = threadIdx.x) - confirm.
host_vector<int> addWithCuda(host_vector<int> h_a, host_vector<int> h_b)
{
    const int size = static_cast<int>(h_a.size());
    host_vector<int> h_c(size);

    // Nothing to compute; this also avoids taking &v[0] of empty vectors and
    // launching a kernel with zero threads (an invalid configuration).
    if (size == 0) {
        return h_c;
    }

    // One thread is launched per element of h_a and the kernel is handed d_b,
    // so (presumably) a shorter h_b would be read out of bounds - reject it.
    if (h_b.size() < h_a.size()) {
        fprintf(stderr, "addWithCuda: h_b has fewer elements than h_a\n");
        return h_c;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        return h_c;
    }

    {
        // Allocate GPU buffers for three vectors (two input, one output) and
        // copy the host data into them. The extra scope makes sure the
        // device_vectors are destroyed before cudaDeviceReset() below.
        device_vector<int> d_c = h_c;
        device_vector<int> d_a = h_a;
        device_vector<int> d_b = h_b;

        int* d_a_ptr = raw_pointer_cast(&d_a[0]);
        int* d_b_ptr = raw_pointer_cast(&d_b[0]);
        int* d_c_ptr = raw_pointer_cast(&d_c[0]);

        // Launch a kernel on the GPU with one thread for each element.
        // Only a single block is used, so the launch is rejected when size
        // exceeds the device's maximum number of threads per block.
        addKernel<<<1, size>>>(d_c_ptr, d_a_ptr, d_b_ptr);

        // Launch-configuration errors are reported through cudaGetLastError().
        // Without this check a rejected launch goes unnoticed, d_c is never
        // written, and the caller just sees an all-zero result.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            return h_c;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered while it was running.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            return h_c;
        }

        // Copy output vector from GPU buffer to host memory.
        h_c = d_c;

        // d_c[0] is a thrust::device_reference<int>, not an int; it must be
        // converted explicitly before being passed through printf's varargs.
        printf("||Debug h_c[0]=%d\td_c[0]=%d\n", h_c[0], static_cast<int>(d_c[0]));
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }
    return h_c;
}

请注意代码行“h_c=d_c;”。在 Thrust 中,这应该将数据从 d_c(设备向量)复制到 h_c(主机向量)。该行不会报错,但也没有正确执行:h_c 始终保持全 0。

我尝试过其他几种方法,例如

thrust::copy(d_c.begin(),d_c.end(),h_c.begin()); 

cudaMemcpy(h_c_ptr,d_c_ptr,size*sizeof(int),cudaMemcpyDeviceToHost);

甚至

for(int i=0;i < size;++i)h_c[i]=d_c[i];

没有任何效果。我在这里迷路了。

有人有类似的事情吗?感谢所有帮助。

I'm using VS2010 on Windows 7 x64 and the CUDA toolkit v4.0 for my university project. I'd like to achieve a simple GPU-vs-CPU test; most of it is done, but none of my CUDA tests return any results. I've checked the memory with the debugger and the device memory contained everything I needed; only the memory copying failed.

// Adds two integer vectors element-wise on the GPU and returns the result.
//
// h_a, h_b : host-side input vectors; h_b must hold at least h_a.size() elements.
// Returns  : a host_vector with h_a.size() elements. If any CUDA call fails, a
//            message is written to stderr and the vector is returned as it was
//            constructed (i.e. without the GPU results).
//
// NOTE(review): addKernel is defined outside this snippet; it is presumably the
// one-thread-per-element kernel (c[i] = a[i] + b[i], i = threadIdx.x) - confirm.
host_vector<int> addWithCuda(host_vector<int> h_a, host_vector<int> h_b)
{
    const int size = static_cast<int>(h_a.size());
    host_vector<int> h_c(size);

    // Nothing to compute; this also avoids taking &v[0] of empty vectors and
    // launching a kernel with zero threads (an invalid configuration).
    if (size == 0) {
        return h_c;
    }

    // One thread is launched per element of h_a and the kernel is handed d_b,
    // so (presumably) a shorter h_b would be read out of bounds - reject it.
    if (h_b.size() < h_a.size()) {
        fprintf(stderr, "addWithCuda: h_b has fewer elements than h_a\n");
        return h_c;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        return h_c;
    }

    {
        // Allocate GPU buffers for three vectors (two input, one output) and
        // copy the host data into them. The extra scope makes sure the
        // device_vectors are destroyed before cudaDeviceReset() below.
        device_vector<int> d_c = h_c;
        device_vector<int> d_a = h_a;
        device_vector<int> d_b = h_b;

        int* d_a_ptr = raw_pointer_cast(&d_a[0]);
        int* d_b_ptr = raw_pointer_cast(&d_b[0]);
        int* d_c_ptr = raw_pointer_cast(&d_c[0]);

        // Launch a kernel on the GPU with one thread for each element.
        // Only a single block is used, so the launch is rejected when size
        // exceeds the device's maximum number of threads per block.
        addKernel<<<1, size>>>(d_c_ptr, d_a_ptr, d_b_ptr);

        // Launch-configuration errors are reported through cudaGetLastError().
        // Without this check a rejected launch goes unnoticed, d_c is never
        // written, and the caller just sees an all-zero result.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            return h_c;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered while it was running.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            return h_c;
        }

        // Copy output vector from GPU buffer to host memory.
        h_c = d_c;

        // d_c[0] is a thrust::device_reference<int>, not an int; it must be
        // converted explicitly before being passed through printf's varargs.
        printf("||Debug h_c[0]=%d\td_c[0]=%d\n", h_c[0], static_cast<int>(d_c[0]));
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }
    return h_c;
}

Note the code line "h_c=d_c;". In Thrust this is supposed to copy the data from d_c (a device vector) to h_c (a host vector). This line doesn't fail, but it doesn't execute correctly either: h_c remains all zeros throughout.

I've tried several other methods like

thrust::copy(d_c.begin(),d_c.end(),h_c.begin()); 

or

cudaMemcpy(h_c_ptr,d_c_ptr,size*sizeof(int),cudaMemcpyDeviceToHost);

or even

for(int i=0;i < size;++i)h_c[i]=d_c[i];

Nothing worked. I'm lost here.

Has anyone run into something similar? All help appreciated.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

佼人 2024-12-14 15:51:05

您只创建了“h_c”,但尚未初始化“h_c”。我认为这就是问题所在。没有内存复制问题

You only create "h_c", but haven't initialized "h_c". I think that is the problem, not the memory copy.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文