Inter-block barrier on CUDA
I want to implement an inter-block barrier on CUDA, but I'm running into a serious problem.
I cannot figure out why it does not work.
#include <iostream>
#include <cstdlib>
#include <ctime>

#define SIZE 10000000
#define BLOCKS 100

using namespace std;

struct Barrier {
    int *count;

    __device__ void wait() {
        atomicSub(count, 1);
        while(*count)
            ;
    }

    Barrier() {
        int blocks = BLOCKS;
        cudaMalloc((void**) &count, sizeof(int));
        cudaMemcpy(count, &blocks, sizeof(int), cudaMemcpyHostToDevice);
    }

    ~Barrier() {
        cudaFree(count);
    }
};

__global__ void sum(int* vec, int* cache, int *sum, Barrier barrier)
{
    int tid = blockIdx.x;
    int temp = 0;
    while(tid < SIZE) {
        temp += vec[tid];
        tid += gridDim.x;
    }
    cache[blockIdx.x] = temp;

    barrier.wait();

    if(blockIdx.x == 0) {
        for(int i = 0; i < BLOCKS; ++i)
            *sum += cache[i];
    }
}

int main()
{
    int* vec_host = (int *) malloc(SIZE * sizeof(int));
    for(int i = 0; i < SIZE; ++i)
        vec_host[i] = 1;

    int *vec_dev;
    int *sum_dev;
    int *cache;
    int sum_gpu = 0;

    cudaMalloc((void**) &vec_dev, SIZE * sizeof(int));
    cudaMemcpy(vec_dev, vec_host, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**) &sum_dev, sizeof(int));
    cudaMemcpy(sum_dev, &sum_gpu, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**) &cache, BLOCKS * sizeof(int));
    cudaMemset(cache, 0, BLOCKS * sizeof(int));

    Barrier barrier;
    sum<<<BLOCKS, 1>>>(vec_dev, cache, sum_dev, barrier);
    cudaMemcpy(&sum_gpu, sum_dev, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(vec_dev);
    cudaFree(sum_dev);
    cudaFree(cache);
    free(vec_host);

    return 0;
}
In fact, even if I rewrite wait() as follows:

__device__ void wait() {
    while(*count != 234124)
        ;
}

the program exits normally. But I would expect an infinite loop in this case.
Answers (3)
Unfortunately, what you want to achieve (inter-block communication/synchronization) isn't strictly possible in CUDA. The CUDA programming guide states that "thread blocks are required to execute independently: It must be possible to execute them in any order, in parallel or in series." The reason for this restriction is to allow flexibility in the thread block scheduler, and to allow the code to agnostically scale with the number of cores. The only supported inter-block synchronization method is to launch another kernel: kernel launches (within the same stream) are implicit synchronization points.
Your code violates the block independence rule because it implicitly assumes that your kernel's thread blocks execute concurrently (cf. in parallel). But there's no guarantee that they do. To see why this matters to your code, let's consider a hypothetical GPU with only one core. We'll also assume that you only want to launch two thread blocks. Your spinloop kernel will actually deadlock in this situation. If thread block zero is scheduled on the core first, it will loop forever when it gets to the barrier, because thread block one never has a chance to update the counter. Because thread block zero is never swapped out (thread blocks execute to their completion) it starves thread block one of the core while it spins.
Some folks have tried schemes such as yours and have seen success because the scheduler happened to serendipitously schedule blocks in such a way that the assumptions worked out. For example, there was a time when launching as many thread blocks as a GPU has SMs meant that the blocks were truly executed concurrently. But they were disappointed when a change to the driver or CUDA runtime or GPU invalidated that assumption, breaking their code.
For your application, try to find a solution that doesn't depend on inter-block synchronization, because (barring a significant change to the CUDA programming model) it just isn't possible.
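For illustration, here is a minimal sketch of the kernel-launch approach applied to your summation; the kernel names and host calls are hypothetical, and it assumes the same device buffers you already allocate in main():

// Sketch only: the in-kernel barrier is replaced by a second kernel launch;
// the launch boundary (within the same stream) is the synchronization point.
__global__ void partialSums(const int* vec, int* cache, int size)
{
    int tid = blockIdx.x;
    int temp = 0;
    while (tid < size) {
        temp += vec[tid];
        tid += gridDim.x;
    }
    cache[blockIdx.x] = temp;   // each block writes its own partial sum
}

__global__ void finalSum(const int* cache, int* sum, int blocks)
{
    int total = 0;
    for (int i = 0; i < blocks; ++i)
        total += cache[i];
    *sum = total;
}

// Host side, replacing the single launch in main():
//   partialSums<<<BLOCKS, 1>>>(vec_dev, cache, SIZE);
//   finalSum<<<1, 1>>>(cache, sum_dev, BLOCKS);   // runs only after partialSums completes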
Block-to-block synchronization is possible. See this paper.
The paper doesn't go into great detail on how it works, but it relies on __syncthreads() to create the pause barrier for the current block while waiting for the other blocks to reach the sync point.
One item the paper doesn't note is that synchronization is only possible if the number of blocks is small enough, or the number of SMs is large enough, for the task at hand. That is, if you have 4 SMs and try to sync 5 blocks, the kernel will deadlock.
With their approach I've been able to spread a long serial task across many blocks, easily saving 30% of the time compared to a single-block approach. In other words, the block sync worked for me.
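To make the idea concrete, here is a rough, single-use sketch of the kind of software barrier such papers describe (the function and variable names are mine, not from the paper). As noted above, it only avoids deadlock if every block is resident on the GPU at the same time:

// One-shot grid-wide barrier: 'arrived' is a device counter the host
// initializes to 0 before the kernel launch.
__device__ void globalBarrier(volatile int* arrived, int numBlocks)
{
    __syncthreads();                  // whole block has reached the barrier
    if (threadIdx.x == 0) {
        __threadfence();              // make this block's prior writes visible grid-wide
        atomicAdd((int*)arrived, 1);  // signal arrival
        while (*arrived < numBlocks)  // spin until every block has arrived
            ;
    }
    __syncthreads();                  // release the rest of the block
}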
Looks like a compiler optimization issue. I'm not good at reading PTX code, but it looks like the compiler has omitted the while loop entirely (even when compiling with -O0).
In CPU code, such behavior is prevented by declaring the variable with the volatile qualifier. But even if we declare count as int __device__ count (and change the code appropriately), adding the volatile specifier just breaks compilation, with errors like: argument of type "volatile int *" is incompatible with parameter of type "void *".
I suggest looking at the threadFenceReduction example from the CUDA SDK. They do pretty much the same thing you do, but the block that performs the final summation is chosen at runtime rather than predefined, and the while loop is eliminated, because a spin-lock on a global variable would be very slow.
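For reference, a rough sketch of that threadFenceReduction pattern applied to your kernel (still one thread per block, reusing your SIZE and BLOCKS definitions; retirementCount and the other names here are mine, not from the SDK sample):

__device__ unsigned int retirementCount = 0;   // must start at 0

__global__ void sum(const int* vec, volatile int* cache, int* sum)
{
    int tid = blockIdx.x;
    int temp = 0;
    while (tid < SIZE) {
        temp += vec[tid];
        tid += gridDim.x;
    }
    cache[blockIdx.x] = temp;       // publish this block's partial sum

    __threadfence();                // make the write visible to other blocks

    // atomicInc returns the old value; the block that sees BLOCKS - 1
    // is the last one to finish, so it performs the final summation.
    unsigned int ticket = atomicInc(&retirementCount, BLOCKS);
    if (ticket == BLOCKS - 1) {
        int total = 0;
        for (int i = 0; i < BLOCKS; ++i)
            total += cache[i];
        *sum = total;
    }
}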