CUDA：在多 GPU 中，内存复制到 GPU 1 的速度较慢

发布于 2024-08-27 12:46:59 字数 2917 浏览 5 评论 0原文

我的公司设置了两个 GTX 295，所以一台服务器中总共有 4 个 GPU，我们有几台服务器。与 GPU 0、2 和 3 相比，我们的 GPU 1 特别慢，因此我编写了一些速度测试来帮助找到问题的原因。

//#include <stdio.h>
//#include <stdlib.h>
//#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cutil.h>

__global__ void test_kernel(float *d_data) {
    int tid = blockDim.x*blockIdx.x + threadIdx.x;
    for (int i=0;i<10000;++i) {
        d_data[tid] = float(i*2.2);
        d_data[tid] += 3.3;
    }
}

int main(int argc, char* argv[])
{

    int deviceCount;                                                         
    cudaGetDeviceCount(&deviceCount);
    int device = 0; //SELECT GPU HERE
    cudaSetDevice(device);


    cudaEvent_t start, stop;
    unsigned int num_vals = 200000000;
    float *h_data = new float[num_vals];
    for (int i=0;i<num_vals;++i) {
        h_data[i] = float(i);
    }

    float *d_data = NULL;
    float malloc_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_data, sizeof(float)*num_vals);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &malloc_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    float mem_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &mem_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    float kernel_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    test_kernel<<<1000,256>>>(d_data);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &kernel_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    printf("cudaMalloc took %f ms\n",malloc_timer);
    printf("Copy to the GPU took %f ms\n",mem_timer);
    printf("Test Kernel took %f ms\n",kernel_timer);

    cudaMemcpy(h_data,d_data, sizeof(float)*num_vals,cudaMemcpyDeviceToHost);

    delete[] h_data;
    return 0;
}

结果是

GPU0 cudaMalloc 花费了 0.908640 毫秒复制到 GPU 花费了 296.058777 毫秒测试内核花费了 326.721283 ms

GPU1 cudaMalloc 花费了 0.913568 毫秒复制到 GPU 花费了 663.182251 毫秒 测试内核花费了 326.710785 ms

GPU2 cudaMalloc 花费了 0.925600 毫秒复制到 GPU 花费了 296.915039 毫秒测试内核花费了 327.127930 ms

GPU3 cudaMalloc 花费了 0.920416 毫秒复制到 GPU 花费了 296.968384 毫秒测试内核花费了 327.038696 毫秒

如您所见，GPU 的 cudaMemcpy 时间是 GPU1 的两倍。这在我们所有服务器之间都是一致的，GPU1 总是很慢。有什么想法可能是这样吗？所有服务器都运行 Windows XP。

原文

My company has a setup of two GTX 295, so a total of 4 GPUs in a server, and we have several servers.
We GPU 1 specifically was slow, in comparison to GPU 0, 2 and 3 so I wrote a little speed test to help find the cause of the problem.

//#include <stdio.h>
//#include <stdlib.h>
//#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cutil.h>

__global__ void test_kernel(float *d_data) {
    int tid = blockDim.x*blockIdx.x + threadIdx.x;
    for (int i=0;i<10000;++i) {
        d_data[tid] = float(i*2.2);
        d_data[tid] += 3.3;
    }
}

int main(int argc, char* argv[])
{

    int deviceCount;                                                         
    cudaGetDeviceCount(&deviceCount);
    int device = 0; //SELECT GPU HERE
    cudaSetDevice(device);


    cudaEvent_t start, stop;
    unsigned int num_vals = 200000000;
    float *h_data = new float[num_vals];
    for (int i=0;i<num_vals;++i) {
        h_data[i] = float(i);
    }

    float *d_data = NULL;
    float malloc_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_data, sizeof(float)*num_vals);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &malloc_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    float mem_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &mem_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    float kernel_timer;
    cudaEventCreate(&start);
    cudaEventCreate(&stop); cudaEventRecord( start, 0 );
    test_kernel<<<1000,256>>>(d_data);
    cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &kernel_timer, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    printf("cudaMalloc took %f ms\n",malloc_timer);
    printf("Copy to the GPU took %f ms\n",mem_timer);
    printf("Test Kernel took %f ms\n",kernel_timer);

    cudaMemcpy(h_data,d_data, sizeof(float)*num_vals,cudaMemcpyDeviceToHost);

    delete[] h_data;
    return 0;
}

The results are

GPU0
cudaMalloc took 0.908640 ms
Copy to the GPU took 296.058777 ms
Test Kernel took 326.721283 ms

GPU1
cudaMalloc took 0.913568 ms
Copy to the GPU took 663.182251 ms
Test Kernel took 326.710785 ms

GPU2
cudaMalloc took 0.925600 ms
Copy to the GPU took 296.915039 ms
Test Kernel took 327.127930 ms

GPU3
cudaMalloc took 0.920416 ms
Copy to the GPU took 296.968384 ms
Test Kernel took 327.038696 ms

As you can see, the cudaMemcpy to the GPU is well double the amount of time for GPU1. This is consistent between all our servers, it is always GPU1 that is slow.
Any ideas why this may be?
All servers are running windows XP.

分享到QQ

分享到微博