CUDA - 将设备数据复制到主机?
我有设备变量,在这个变量中,我在设备中分配并填充一个数组,但在将数据获取到主机时遇到问题。 cudaMemcpy()
返回 cudaErrorInvalidValue
错误。我该怎么办?
PS:代码只是示例,我知道,在这种特殊情况下,我可以使用 cudaMalloc 因为我知道数组的大小,但在我的真实代码中,它计算数组的大小设备,它需要立即分配内存。
PS2:我发现了类似的问题,但我还是不知道,如何解决? - 从设备复制设备中分配的数据主机
PS3:我已经更新了代码,但仍然无法工作:{
PS4:我只是尝试在具有 Nvidia GT 520MX(最新游戏驱动程序)的笔记本上运行此代码,但也无法工作: (
谢谢
#include <cuda.h>
#include <stdio.h>
#define N 400
__device__ int* d_array;
__global__ void allocDeviceMemory()
{
d_array = new int[N];
for(int i=0; i < N; i++)
d_array[i] = 123;
}
int main()
{
allocDeviceMemory<<<1, 1>>>();
cudaDeviceSynchronize();
int* d_a = NULL;
cudaMemcpyFromSymbol((void**)&d_a, "d_array", sizeof(d_a), 0, cudaMemcpyDeviceToHost);
printf("gpu adress: %lld\n", d_a);
int* h_array = (int*)malloc(N*sizeof(int));
cudaError_t errr = cudaMemcpy(h_array, d_a, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("h_array: %d, %d\n", h_array[0], errr);
getchar();
return 0;
}
I have device variable and in this variable, I allocate and fill an array in the device, but I have a problem to get data to host. cudaMemcpy()
return cudaErrorInvalidValue
error. how can I do it?
PS: The Code is just example, I know, that In this particular case I can use cudaMalloc
because I know the size of the array, but In my REAL code, It computes the size of the array in the device and it needs immediately allocate memory.
PS2: I found a similar problem, but I still don't know, how can I solve it? - copy data which is allocated in device from device to host
PS3: I have updated code, but still doesn't work:{
PS4: I am just trying to run this code on a notebook with Nvidia GT 520MX(latest game driver) and doesn't work too :(
thx
#include <cuda.h>
#include <stdio.h>
#define N 400
__device__ int* d_array;
__global__ void allocDeviceMemory()
{
d_array = new int[N];
for(int i=0; i < N; i++)
d_array[i] = 123;
}
int main()
{
allocDeviceMemory<<<1, 1>>>();
cudaDeviceSynchronize();
int* d_a = NULL;
cudaMemcpyFromSymbol((void**)&d_a, "d_array", sizeof(d_a), 0, cudaMemcpyDeviceToHost);
printf("gpu adress: %lld\n", d_a);
int* h_array = (int*)malloc(N*sizeof(int));
cudaError_t errr = cudaMemcpy(h_array, d_a, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("h_array: %d, %d\n", h_array[0], errr);
getchar();
return 0;
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
启动内核后需要同步(
cudaDeviceSynchronize()
)来分配内存。您还可以检查同步和所有其他 CUDA API 调用的返回值吗?
You need to synchronize (
cudaDeviceSynchronize()
) after launching the kernel to allocate the memory.Can you also check the return value of the sync and all other CUDA API calls?
我已经测试了你的代码,这里没有错误。我正在运行 CUDA 4.0。
i have tested your code and there is no error here. I am running CUDA 4.0.