Is it guaranteed that memory regions allocated on different GPUs do not overlap?

Asked 2025-01-23 03:40:16


For example, the following C++ code concurrently allocates two 4 GB slabs on two separate GPU devices using cuMemAlloc(). The numerical address ranges appear to never overlap with each other. Is this guaranteed in general for GPU devices on the same host? I wasn't able to find any mention of this in the documentation. (In contrast, I have read that non-overlap is not guaranteed between CPU and GPU memory addresses.)

The practical application of this is that one would not need to store an (address, device_id) pair for every GPU memory location. Instead, as long as we keep track of which memory address ranges have been allocated on which device, we can always infer the correct device when given only the (GPU) memory address; a minimal sketch of this idea follows the code below.

#include <iostream>
#include <string>
#include <thread>

#include <cuda.h>

// Memory allocated on different devices appears to never overlap.

CUcontext g_device_contexts[2];

// Convert a CUresult into a human-readable message.
std::string errorMessage(CUresult const result) {
  char const *err = nullptr;
  cuGetErrorString(result, &err);
  return err ? err : "unknown CUDA error";
}

void checkError(CUresult const result, int line) {
  if (result != CUDA_SUCCESS) {
    throw std::runtime_error(errorMessage(result) + " on line " +
                             std::to_string(line));
  }
}

// Create one context per device. (The device ordinal is passed directly as
// the CUdevice handle; strictly, it should be obtained via cuDeviceGet().)
void create_contexts() {
  for (int i = 0; i < 2; ++i) {
    checkError(cuCtxCreate(&g_device_contexts[i], 0, i), __LINE__);
  }
}

// Bind the calling thread to the device's context, then allocate 4 GiB.
void cuda_mem_alloc(int const device_id, CUdeviceptr *device_ptr_ptr) {
  checkError(cuCtxSetCurrent(g_device_contexts[device_id]), __LINE__);
  checkError(cuMemAlloc(device_ptr_ptr, 0x100000000ull), __LINE__);
}

int main() {
  checkError(cuInit(0), __LINE__);
  create_contexts();
  CUdeviceptr device_ptr[2];
  // Allocate on both devices concurrently from separate threads.
  std::thread t1(cuda_mem_alloc, 1, &device_ptr[1]);
  std::thread t0(cuda_mem_alloc, 0, &device_ptr[0]);
  t0.join();
  t1.join();
  std::cout << "device_ptr[0]=" << (void *)device_ptr[0] << std::endl;
  std::cout << "device_ptr[1]=" << (void *)device_ptr[1] << std::endl;
  size_t const diff = device_ptr[0] < device_ptr[1]
                          ? device_ptr[1] - device_ptr[0]
                          : device_ptr[0] - device_ptr[1];
  std::cout << "Absolute diff=" << diff << std::endl;
  return 0;
}
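
As an illustration of the range-tracking idea mentioned above, here is a minimal sketch (not part of the original question): a registry keyed by the base address of each allocation. DeviceRangeRegistry and its method names are hypothetical, and the lookup assumes the non-overlap property the question asks about actually holds.

#include <cstddef>
#include <map>
#include <stdexcept>

#include <cuda.h>

// Hypothetical registry mapping allocated [base, base + size) ranges to the
// device they were allocated on.
class DeviceRangeRegistry {
public:
  // Record that [base, base + size) was allocated on device_id.
  void add(CUdeviceptr base, size_t size, int device_id) {
    ranges_[base] = Range{size, device_id};
  }

  // Return the device owning ptr, or throw if no recorded range contains it.
  int device_of(CUdeviceptr ptr) const {
    auto it = ranges_.upper_bound(ptr); // first range starting strictly after ptr
    if (it == ranges_.begin()) throw std::runtime_error("unknown pointer");
    --it;                               // last range starting at or before ptr
    if (ptr >= it->first + it->second.size)
      throw std::runtime_error("unknown pointer");
    return it->second.device_id;
  }

private:
  struct Range { size_t size; int device_id; };
  std::map<CUdeviceptr, Range> ranges_; // keyed by base address, kept sorted
};

In the program above one would call registry.add(device_ptr[i], 0x100000000ull, i) after each cuMemAlloc(), and registry.device_of(p) for any pointer p inside either slab would then return the matching device ordinal.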


Answered by 君勿笑, 2025-01-30 03:40:16


It should be guaranteed in a UA (unified addressing) setting (and for the driver API), subject to an appropriate interpretation of what that means for UM (unified memory) usage.

Non-overlap of CPU and GPU addresses should also be guaranteed in a UA setting. UA is in effect on any 64-bit platform.
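
As a related sketch (not part of the original answer), the driver API can report both whether unified addressing is in effect and which device a given pointer was allocated on, so the pointer-to-device mapping can also be queried directly instead of inferred from non-overlap. The example below uses device 0 only and omits error checking for brevity.

#include <iostream>

#include <cuda.h>

int main() {
  cuInit(0);

  CUdevice dev;
  cuDeviceGet(&dev, 0);

  // 1 if unified addressing (a single address space shared with the host) is active.
  int unified = 0;
  cuDeviceGetAttribute(&unified, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
  std::cout << "unified addressing: " << unified << std::endl;

  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  CUdeviceptr ptr;
  cuMemAlloc(&ptr, 1 << 20); // 1 MiB test allocation

  // Ask the driver which device ordinal this pointer belongs to.
  int ordinal = -1;
  cuPointerGetAttribute(&ordinal, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, ptr);
  std::cout << "pointer belongs to device " << ordinal << std::endl;

  cuMemFree(ptr);
  cuCtxDestroy(ctx);
  return 0;
}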
