CUDA: Memory copy to GPU 1 is slower in multi-GPU
My company has a setup of two GTX 295 cards, so a total of 4 GPUs per server, and we have several servers.
GPU 1 specifically was slow in comparison to GPU 0, 2 and 3, so I wrote a little speed test to help find the cause of the problem.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cutil.h>

// Simple kernel: each thread repeatedly rewrites its own element of d_data.
__global__ void test_kernel(float *d_data) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = 0; i < 10000; ++i) {
        d_data[tid] = float(i * 2.2);
        d_data[tid] += 3.3f;
    }
}

int main(int argc, char* argv[])
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    int device = 0; // SELECT GPU HERE
    cudaSetDevice(device);

    cudaEvent_t start, stop;

    // Fill 200 million floats (~800 MB) of host data.
    unsigned int num_vals = 200000000;
    float *h_data = new float[num_vals];
    for (unsigned int i = 0; i < num_vals; ++i) {
        h_data[i] = float(i);
    }

    // Time the device allocation.
    float *d_data = NULL;
    float malloc_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMalloc((void**)&d_data, sizeof(float) * num_vals);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&malloc_timer, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Time the host-to-device copy.
    float mem_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMemcpy(d_data, h_data, sizeof(float) * num_vals, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&mem_timer, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Time the kernel.
    float kernel_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    test_kernel<<<1000, 256>>>(d_data);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&kernel_timer, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("cudaMalloc took %f ms\n", malloc_timer);
    printf("Copy to the GPU took %f ms\n", mem_timer);
    printf("Test Kernel took %f ms\n", kernel_timer);

    // Copy results back and clean up.
    cudaMemcpy(h_data, d_data, sizeof(float) * num_vals, cudaMemcpyDeviceToHost);
    cudaFree(d_data);
    delete[] h_data;
    return 0;
}
The results are
GPU0
cudaMalloc took 0.908640 ms
Copy to the GPU took 296.058777 ms
Test Kernel took 326.721283 ms
GPU1
cudaMalloc took 0.913568 ms
Copy to the GPU took 663.182251 ms
Test Kernel took 326.710785 ms
GPU2
cudaMalloc took 0.925600 ms
Copy to the GPU took 296.915039 ms
Test Kernel took 327.127930 ms
GPU3
cudaMalloc took 0.920416 ms
Copy to the GPU took 296.968384 ms
Test Kernel took 327.038696 ms
As you can see, the cudaMemcpy to the GPU takes well over double the time on GPU1. This is consistent across all our servers; it is always GPU1 that is slow.
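(For scale: 200 million floats is roughly 800 MB, so ~296 ms works out to about 2.7 GB/s of host-to-device bandwidth on GPU 0, 2 and 3, while ~663 ms on GPU 1 is only about 1.2 GB/s.)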
Any ideas why this may be?
All servers are running Windows XP.
Comments (4)
This was a driver issue. Updating to the latest driver fixed it.
This may be an issue with your PCI bus. Try swapping the cards into different slots to see if the problem persists. If it does, copy all your data onto the GTX 295 in the faster slot and use SLI to copy it across to the other (slow PCI bus) GPU.
If you can utilize the faster video card's GDDR to load, then you can do a device-to-device transfer at much, MUCH higher bandwidth, which might also help eliminate the issue. Also, check your bandwidth with NVIDIA's bandwidth test to get some physical results.
Good luck!
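For reference, a minimal per-device host-to-device bandwidth check could look something like the sketch below. This is not the SDK's bandwidthTest sample, just a rough illustration; the 100 MB buffer size and single transfer per device are arbitrary choices.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Rough host-to-device bandwidth check for every GPU in the system.
int main()
{
    const size_t bytes = 100 * 1024 * 1024;   // 100 MB per transfer (arbitrary)
    float *h_buf = (float*)malloc(bytes);

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; ++dev) {
        cudaSetDevice(dev);

        float *d_buf = NULL;
        cudaMalloc((void**)&d_buf, bytes);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Time one pageable host-to-device copy on this device.
        cudaEventRecord(start, 0);
        cudaMemcpy(d_buf, h_buf, bytes, cudaMemcpyHostToDevice);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("GPU %d: H2D %.1f MB in %.3f ms = %.2f GB/s\n",
               dev, bytes / (1024.0 * 1024.0), ms,
               (bytes / (ms / 1000.0)) / 1e9);

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(d_buf);
    }
    free(h_buf);
    return 0;
}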
Are you running in a dual processor setup? There is a bug in the current Tylersburg chipsets such that the bandwidth of the path x86 (0) to GPU (1) is slower than the direct path from x86 (0) to GPU (0). Intel should release a new version to fix this bug. Try locking your test process to a specific CPU using taskset and see what results you get.
regards
Mark
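Since the servers in the question run Windows XP rather than Linux, taskset is not available there. A rough Win32 equivalent (a sketch only; the mask value 0x1, i.e. CPU 0, is an arbitrary choice to vary per test run) is to set the process affinity mask before doing any CUDA work:

#include <windows.h>
#include <stdio.h>

int main()
{
    // Pin this process to CPU 0 (bit 0 of the mask); change the mask to test
    // other CPUs, e.g. 0x2 for CPU 1. Similar in spirit to Linux taskset.
    DWORD_PTR mask = 0x1;
    if (!SetProcessAffinityMask(GetCurrentProcess(), mask)) {
        printf("SetProcessAffinityMask failed: %lu\n", GetLastError());
        return 1;
    }

    // ... run the cudaMemcpy speed test from the question here ...
    return 0;
}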