指定结束位时 cub::DeviceRadixSort 失败
我正在使用 CUB 库的 GPU 基数排序算法对 N 个 32 位无符号整数进行排序,这些整数的值都仅使用 32 位中的 k 个,从最低有效位开始。
因此,我在调用 cub::DeviceRadixSort:: 时指定位子范围 [begin_bit, end_bit)排序键 中希望能够提高排序性能。我使用的是最新版本的 CUB (1.16.0)。
然而,当尝试对具有某些指定位范围 [begin_bit=0, end_bit=k) 和 k = {20,19, 18},例如 ./cub_sort_test 1000000000 0 20
我在带有 CUDA 的 Volta 和 Ampere NVIDIA GPU 上对此进行了测试分别为 11.4 和 11.2 版本。有没有人以前遇到过这个问题,和/或知道解决方法?这是最小的、可重现的示例代码:
// HOW TO BUILD: nvcc -O3 -std=c++17 -Xcompiler -fopenmp cub_sort_test.cu -o cub_sort_test
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <parallel/algorithm>
#include <random>
#include <vector>
#include <iostream>
#define DEBUG
#ifdef DEBUG
#define CheckCudaError(instruction) \
{ AssertNoCudaError((instruction), __FILE__, __LINE__); }
#else
#define CheckCudaError(instruction) instruction
#endif
inline void AssertNoCudaError(cudaError_t error_code, const char* file, int line) {
if (error_code != cudaSuccess) {
std::cout << "Error: " << cudaGetErrorString(error_code) << " " << file << " " << line << "\n";
}
}
template <typename T>
using PinnedHostVector = thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
std::mt19937 SeedRandomGenerator(uint32_t distribution_seed) {
const size_t seeds_bytes = sizeof(std::mt19937::result_type) * std::mt19937::state_size;
const size_t seeds_length = seeds_bytes / sizeof(std::seed_seq::result_type);
std::vector<std::seed_seq::result_type> seeds(seeds_length);
std::generate(seeds.begin(), seeds.end(), [&]() {
distribution_seed = (distribution_seed << 1) | (distribution_seed >> (-1 & 31));
return distribution_seed;
});
std::seed_seq seed_sequence(seeds.begin(), seeds.end());
return std::mt19937{seed_sequence};
}
int main(int argc, char* argv[]) {
if (argc != 4) {
std::cerr << "Usage: ./cub-sort-test <num_keys> <gpu_id> <bit_entropy>" << std::endl;
return -1;
}
size_t num_keys = std::stoull(argv[1]);
int gpu = std::stoi(argv[2]);
size_t bit_entropy = std::stoi(argv[3]);
cudaStream_t stream;
CheckCudaError(cudaSetDevice(gpu));
CheckCudaError(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
PinnedHostVector<uint32_t> keys(num_keys);
#pragma omp parallel num_threads(64)
{
uint32_t max = (1 << bit_entropy) - 1;
if (bit_entropy == sizeof(uint32_t) * 8) {
max = std::numeric_limits<uint32_t>::max();
} else if (bit_entropy == 1) {
max = 2;
}
std::mt19937 random_generator = SeedRandomGenerator(2147483647 + static_cast<size_t>(omp_get_thread_num()));
std::uniform_real_distribution<double> uniform_dist(0, max);
#pragma omp for schedule(static)
for (size_t i = 0; i < num_keys; ++i) {
keys[i] = static_cast<uint32_t>(uniform_dist(random_generator));
}
}
thrust::device_vector<uint32_t> device_vector(num_keys);
thrust::copy(keys.begin(), keys.end(), device_vector.begin());
CheckCudaError(cudaDeviceSynchronize());
size_t num_temporary_bytes = 0;
cub::DeviceRadixSort::SortKeys(
NULL, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream); // bit subrange is [begin_bit, end_bit), thus bit_entropy + 1
uint8_t* temporary_storage = nullptr;
CheckCudaError(cudaMalloc(reinterpret_cast<void**>(&temporary_storage), num_temporary_bytes));
cub::DeviceRadixSort::SortKeys(
(void*)temporary_storage, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream);
CheckCudaError(cudaStreamSynchronize(stream));
thrust::copy(device_vector.begin(), device_vector.end(), keys.begin());
CheckCudaError(cudaFree(temporary_storage));
if (std::is_sorted(keys.begin(), keys.end()) == false) {
std::cout << "Error: Sorting failed." << std::endl;
}
return 0;
}
I am using the GPU radix sort algorithm of the CUB library to sort N 32-bit unsigned integers whose values all utilize only k of their 32 bits, starting from the least significant bit.
Thus, I specify the bit subrange [begin_bit, end_bit) when calling cub::DeviceRadixSort::SortKeys in hopes of improving the sorting performance. I am using the latest release of CUB (1.16.0).
However, SortKeys crashes (not deterministically, but almost always) and reports an illegal memory access error when trying to sort 1 billion keys with certain specified bit ranges of [begin_bit=0, end_bit=k), and k = {20,19,18}, e.g. ./cub_sort_test 1000000000 0 20
I tested this on a Volta and an Ampere NVIDIA GPU with CUDA versions 11.4 and 11.2 respectively. Has anyone encountered this previously, and/or know a fix? Here is the minimal, reproducable example code:
// HOW TO BUILD: nvcc -O3 -std=c++17 -Xcompiler -fopenmp cub_sort_test.cu -o cub_sort_test
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <parallel/algorithm>
#include <random>
#include <vector>
#include <iostream>
#define DEBUG
#ifdef DEBUG
#define CheckCudaError(instruction) \
{ AssertNoCudaError((instruction), __FILE__, __LINE__); }
#else
#define CheckCudaError(instruction) instruction
#endif
inline void AssertNoCudaError(cudaError_t error_code, const char* file, int line) {
if (error_code != cudaSuccess) {
std::cout << "Error: " << cudaGetErrorString(error_code) << " " << file << " " << line << "\n";
}
}
template <typename T>
using PinnedHostVector = thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
std::mt19937 SeedRandomGenerator(uint32_t distribution_seed) {
const size_t seeds_bytes = sizeof(std::mt19937::result_type) * std::mt19937::state_size;
const size_t seeds_length = seeds_bytes / sizeof(std::seed_seq::result_type);
std::vector<std::seed_seq::result_type> seeds(seeds_length);
std::generate(seeds.begin(), seeds.end(), [&]() {
distribution_seed = (distribution_seed << 1) | (distribution_seed >> (-1 & 31));
return distribution_seed;
});
std::seed_seq seed_sequence(seeds.begin(), seeds.end());
return std::mt19937{seed_sequence};
}
int main(int argc, char* argv[]) {
if (argc != 4) {
std::cerr << "Usage: ./cub-sort-test <num_keys> <gpu_id> <bit_entropy>" << std::endl;
return -1;
}
size_t num_keys = std::stoull(argv[1]);
int gpu = std::stoi(argv[2]);
size_t bit_entropy = std::stoi(argv[3]);
cudaStream_t stream;
CheckCudaError(cudaSetDevice(gpu));
CheckCudaError(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
PinnedHostVector<uint32_t> keys(num_keys);
#pragma omp parallel num_threads(64)
{
uint32_t max = (1 << bit_entropy) - 1;
if (bit_entropy == sizeof(uint32_t) * 8) {
max = std::numeric_limits<uint32_t>::max();
} else if (bit_entropy == 1) {
max = 2;
}
std::mt19937 random_generator = SeedRandomGenerator(2147483647 + static_cast<size_t>(omp_get_thread_num()));
std::uniform_real_distribution<double> uniform_dist(0, max);
#pragma omp for schedule(static)
for (size_t i = 0; i < num_keys; ++i) {
keys[i] = static_cast<uint32_t>(uniform_dist(random_generator));
}
}
thrust::device_vector<uint32_t> device_vector(num_keys);
thrust::copy(keys.begin(), keys.end(), device_vector.begin());
CheckCudaError(cudaDeviceSynchronize());
size_t num_temporary_bytes = 0;
cub::DeviceRadixSort::SortKeys(
NULL, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream); // bit subrange is [begin_bit, end_bit), thus bit_entropy + 1
uint8_t* temporary_storage = nullptr;
CheckCudaError(cudaMalloc(reinterpret_cast<void**>(&temporary_storage), num_temporary_bytes));
cub::DeviceRadixSort::SortKeys(
(void*)temporary_storage, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream);
CheckCudaError(cudaStreamSynchronize(stream));
thrust::copy(device_vector.begin(), device_vector.end(), keys.begin());
CheckCudaError(cudaFree(temporary_storage));
if (std::is_sorted(keys.begin(), keys.end()) == false) {
std::cout << "Error: Sorting failed." << std::endl;
}
return 0;
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
您的代码的问题是您没有正确使用
SortKeys
。SortKeys
无法就地工作。您需要为排序后的数据提供单独的输出缓冲区。如果未排序的数组在排序后不再使用并且可以被覆盖,我建议使用采用
DoubleBuffer
的重载来减少内存使用。否则,将分配一个临时键数组,因为const Key*
输入无法被覆盖。The problem with your code is that you do not use
SortKeys
correctly.SortKeys
does not work in-place. You need to provide a separate output buffer for the sorted data.If the unsorted array is no longer used after sorting and can be overwritten, I recommend to use the overload which takes a
DoubleBuffer<Keys>
to reduce memory usage. Otherwise, a temporary keys array will be allocated since theconst Key*
input cannot be overwritten.