NVidia Thrust: arbitrary transformation over a 3D grid
I want to parallelize the following nested for loop on the GPU using NVidia thrust.
// complex multiplication
inline __host__ __device__ float2 operator* (const float2 a, const float2 b) {
    return make_float2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
}

int main()
{
    const int M = 100, N = 100, K = 100;
    float2 A[M*N*K], E[M*N*K];
    float vec_M[M], vec_N[N], vec_K[K];

    for (int i_M = 0; i_M < M; i_M++)
    {
        for (int i_N = 0; i_N < N; i_N++)
        {
            for (int i_K = 0; i_K < K; i_K++)
            {
                unsigned int idx = i_M * (N * K) + i_N * K + i_K; // linear index
                float2 a = A[idx];
                float b = vec_M[i_M];
                float c = vec_N[i_N];
                float d = vec_K[i_K];
                float arg1 = (b + c) / d;
                float2 val1 = make_float2(cosf(arg1), sinf(arg1));
                E[idx] = a * val1; // calls the custom multiplication operator
            }
        }
    }
    return 0;
}
I could implement this as
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin(), E.begin())),
thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end(), D.end(), E.end())),
custom_functor());
similar to the example given in thrust/examples/arbitrary_transformation.cu.
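For reference, a minimal sketch of what such a functor could look like, modeled on the functor in that Thrust example (this is my own illustration, assuming B, C, and D have already been expanded to device vectors of length M*N*K; it reuses the float2 operator* defined above):

#include <thrust/tuple.h>

// Functor for the zip-iterator version: each tuple carries (a, b, c, d, e)
// and the result is written to the last element.
struct custom_functor
{
    template <typename Tuple>
    __host__ __device__ void operator()(Tuple t)
    {
        float2 a = thrust::get<0>(t);
        float  b = thrust::get<1>(t);
        float  c = thrust::get<2>(t);
        float  d = thrust::get<3>(t);
        float  arg1 = (b + c) / d;
        float2 val1 = make_float2(cosf(arg1), sinf(arg1));
        thrust::get<4>(t) = a * val1; // custom complex multiplication
    }
};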
However, to do that, I have to create the matrices B, C, and D, each of size M x N x K, even though three vectors of size M, N, and K are sufficient to represent the corresponding values. Creating these matrices requires (3 * M * N * K) / (M + N + K) times more memory than just using the vectors (a factor of 10,000 for M = N = K = 100).
Is there something like MATLAB's meshgrid() function that allows creating a 3D grid from three vectors without actually storing the full matrices? In my mind the pseudo code would look like this:
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), meshgrid(vec_M.begin(), vec_N.begin(), vec_K.begin()), E.begin())),
thrust::make_zip_iterator(thrust::make_tuple(A.end(), meshgrid(vec_M.end(), vec_N.end(), vec_K.end()), E.end())),
custom_functor());
You can simply collapse the nested loops into a single loop and use for_each with a counting iterator. In the functor, you then compute the three indices from the single loop variable.
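Here is a minimal, self-contained sketch of that approach (the functor name grid_functor, the raw-pointer members, and the explicit thrust::device policy are my own choices for illustration, not the answer's original code):

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>

// Same custom complex multiplication as in the question.
inline __host__ __device__ float2 operator* (const float2 a, const float2 b) {
    return make_float2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
}

// Recovers (i_M, i_N, i_K) from the flat index and applies the loop body.
// Raw device pointers are used so the functor can be copied to the GPU.
struct grid_functor
{
    const float2* A;
    float2*       E;
    const float*  vec_M;
    const float*  vec_N;
    const float*  vec_K;
    int           N, K;

    __host__ __device__ void operator()(unsigned int idx) const
    {
        unsigned int i_M = idx / (N * K);
        unsigned int i_N = (idx / K) % N;
        unsigned int i_K = idx % K;

        float arg1 = (vec_M[i_M] + vec_N[i_N]) / vec_K[i_K];
        float2 val1 = make_float2(cosf(arg1), sinf(arg1));
        E[idx] = A[idx] * val1; // custom complex multiplication
    }
};

int main()
{
    const int M = 100, N = 100, K = 100;
    thrust::device_vector<float2> A(M * N * K), E(M * N * K);
    thrust::device_vector<float>  vec_M(M), vec_N(N), vec_K(K);
    // ... fill A, vec_M, vec_N, vec_K ...

    grid_functor f{thrust::raw_pointer_cast(A.data()),
                   thrust::raw_pointer_cast(E.data()),
                   thrust::raw_pointer_cast(vec_M.data()),
                   thrust::raw_pointer_cast(vec_N.data()),
                   thrust::raw_pointer_cast(vec_K.data()),
                   N, K};

    // Iterate over the flattened index range [0, M*N*K).
    thrust::for_each(thrust::device,
                     thrust::counting_iterator<unsigned int>(0),
                     thrust::counting_iterator<unsigned int>(M * N * K),
                     f);
    return 0;
}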
On a side note: CUDA has a function sincosf, which may be more efficient than calling both sin and cos with the same argument: https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g9456ff9df91a3874180d89a94b36fd46
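In the functor above, that would replace the two trigonometric calls with something along these lines (a sketch of the device-side body only):

// Compute sine and cosine of arg1 with a single call.
float s, c;
sincosf(arg1, &s, &c);
float2 val1 = make_float2(c, s); // real part = cos, imaginary part = sin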