当前位置：文江博客话题详情

CUDA 和 HPC 中的一维问题

发布于 2024-10-21 18:26:00 字数 302 浏览 10 评论 0原文

我正在寻找 CUDA 和 HPC 中的一些一维问题，例如 Black Scholes。

我所说的一维问题是指所有工作都在一维数组上完成的问题。虽然矩阵乘法可以用这种方式表达，但我想要的问题是基本问题只是一维的。

我正在尝试为 CUDA 开发一个一维库，并且需要一些基准问题来测试它。我意识到很多现实世界的问题都是用二维表示的，我真的很想看到一些现实世界的一维问题。

谢谢。

编辑：感谢您的所有答案。如果答案包含更多 HPC 问题（例如 Black Scholes），而不仅仅是通用算法，那就太好了。谢谢。

原文

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

稚然 2024-10-28 18:26:00

并行编程中的一个常见问题是减少：给定一个数字数组，您必须计算“前缀和”，即每个元素存储所有前面元素的总和（+本身或不。我更喜欢包含）。

这是一个相当简单的问题，但由于它经常在更复杂的算法中重复多次，因此高效至关重要。

另一个常见问题是排序。

已经有一些关于该主题的论文，以这篇论文为例：

在此处输入链接描述< /a>

我认为这是一个很好的起点，可以在此基础上解决更大的问题。

回复收藏 0 原文

揪着可爱 2024-10-28 18:26:00

可以用于 1 到 3 维的一个简单问题是热方程。有几种不同的数值方法可以解决它，其中一些可以并行实现。

至少适用于 OpenMp 和 MPI 的方法是有限差分法。我想如果你将它与一个聪明的模板结合起来，你应该能够在 Cuda C 中有效地实现它。

回复收藏 0 原文

假装爱人 2024-10-28 18:26:00

热方程提供了一个经典的一维示例。

下面，我将利用 Jacobi 解决方案针对该主题发布一个具体的、完全运行的 CPU/GPU 示例。请注意，提供了两个时间步内核，一个不使用共享内存，一个使用共享内存。

#include <stdio.h>
#include <stdlib.h>

#include <thrust\device_vector.h>

#include "Utilities.cuh"

#define BLOCKSIZE  512

/****************************/
/* CPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DCPU(float * __restrict__ h_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    float *h_DeltaT = (float *)malloc(N * sizeof(float));

    // --- Enforcing boundary condition at the left end.
    *h_T = T0;
    h_DeltaT[0] = 0.f;

    float current_max;
    do {
        // --- Internal region between the two boundaries.
        for (int i = 1; i < N - 1; i++) h_DeltaT[i] = dt * alpha * ((h_T[i - 1] + h_T[i + 1] - 2.f * h_T[i]) / (dx * dx));

        // --- Enforcing boundary condition at the right end.
        h_DeltaT[N - 1] = dt * 2.f * ((k * ((h_T[N - 2] - h_T[N - 1]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature and find the maximum DeltaT over all nodes
        current_max = h_DeltaT[0]; // --- Remember: h_DeltaT[0] = 0
        for (int i = 1; i < N; i++)
        {
            h_T[i] = h_T[i] + h_DeltaT[i]; // h_T[0] keeps
            current_max = abs(h_DeltaT[i]) > current_max ? abs(h_DeltaT[i]) : current_max;
        }

        // --- Increase iteration counter
        (*Niter)++;

    } while (*Niter < maxIterNumber && current_max > maxErr);

    delete [] h_DeltaT;
}

/**************************/
/* GPU CALCULATION KERNEL */
/**************************/
__global__ void HeatEquation1DGPU_IterationKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N) {

        // --- Internal region between the two boundaries.
        if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid]  = dt * alpha *((d_T[tid - 1] + d_T[tid + 1] - 2.f * d_T[tid]) / (dx * dx));
        // --- Enforcing boundary condition at the left end.
        if (tid == 0)                    d_DeltaT[0]    = 0.f;
        // --- Enforcing boundary condition at the right end.
        if (tid == N - 1)                d_DeltaT[tid]  = dt * 2.f * ((k * ((d_T[tid - 1] - d_T[tid]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature
        d_T[tid] = d_T[tid] + d_DeltaT[tid]; 

        d_DeltaT[tid] = abs(d_DeltaT[tid]);
    }

}

__global__ void HeatEquation1DGPU_IterationSharedKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // --- Shared memory has 0, 1, ..., BLOCKSIZE - 1, BLOCKSIZE locations, so it has BLOCKSIZE locations + 2 (left and right) halo cells.
    __shared__ float d_T_shared[BLOCKSIZE + 2];             // --- Need to know BLOCKSIZE beforehand

    if (tid < N) {

        // --- Load data from global memory to shared memory locations 1, 2, ..., BLOCKSIZE - 1
        d_T_shared[threadIdx.x + 1] = d_T[tid];                 

        // --- Left halo cell
        if ((threadIdx.x == 0) && (tid > 0)) { d_T_shared[0] = d_T[tid - 1]; }          

        // --- Right halo cell
        if ((threadIdx.x == blockDim.x - 1) && (tid < N - 1)) { d_T_shared[threadIdx.x + 2] = d_T[tid + 1]; } 

        __syncthreads();

        // --- Internal region between the two boundaries.
        if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid]  = dt * alpha *((d_T_shared[threadIdx.x] + d_T_shared[threadIdx.x + 2] - 2.f * d_T_shared[threadIdx.x + 1]) / (dx * dx));

        // --- Enforcing boundary condition at the left end.
        if (tid == 0)                    d_DeltaT[0]    = 0.f;

        // --- Enforcing boundary condition at the right end.
        if (tid == N - 1)                d_DeltaT[tid]  = dt * 2.f * ((k * ((d_T_shared[threadIdx.x] - d_T_shared[threadIdx.x + 1]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature
        d_T[tid] = d_T[tid] + d_DeltaT[tid]; 

        d_DeltaT[tid] = abs(d_DeltaT[tid]);
    }

}

/****************************/
/* GPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DGPU(float * __restrict__ d_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    // --- Absolute values of DeltaT
    float *d_DeltaT;    gpuErrchk(cudaMalloc(&d_DeltaT, N * sizeof(float)));

    // --- Enforcing boundary condition at the left end.
    gpuErrchk(cudaMemcpy(d_T, &T0, sizeof(float), cudaMemcpyHostToDevice));

    float current_max = 0.f;
    do {
        //HeatEquation1DGPU_IterationKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
        HeatEquation1DGPU_IterationSharedKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

        thrust::device_ptr<float> d = thrust::device_pointer_cast(d_DeltaT);  
        current_max = thrust::reduce(d, d + N, current_max, thrust::maximum<float>());

        // --- Increase iteration counter
        (*Niter)++;

    } while (*Niter < maxIterNumber && current_max > maxErr);

    gpuErrchk(cudaFree(d_DeltaT));
}

/********/
/* MAIN */
/********/
int main()
{
    // --- See https://en.wikipedia.org/wiki/Thermal_diffusivity

    // --- Parameters of the problem
    const float k           = 0.19f;                    // --- Thermal conductivity [W / (m * K)]
    const float rho         = 930.f;                    // --- Density [kg / m^3]
    const float cp          = 1340.f;                   // --- Specific heat capacity [J / (kg * K)]
    const float alpha       = k / (rho * cp);           // --- Thermal diffusivity [m^2 / s]
    const float length      = 1.6f;                     // --- Total length of the domain [m]
    const int N             = 64 * BLOCKSIZE;           // --- Number of grid points
    const float dx          = (length / (float)(N - 1));// --- Discretization step [m]
    const float dt          = (float)(dx * dx / (4.f * alpha));
                                                        // --- Time step [s]
    const float T0          = 0.f;                      // --- Temperature at the first end of the domain [C]
    const float Q_N_1       = 10.f;                     // --- Heat flux at the second end of the domain [W / m^2]
    const float maxErr      = 1.0e-5f;                  // --- Maximum admitted DeltaT
    const int maxIterNumber = 10.0 / dt;                // --- Number of overall time steps

    /********************/
    /* GPU CALCULATIONS */
    /********************/
    float *h_T_final_device = (float *)malloc(N * sizeof(float));       // --- Final "host-side" result of GPU calculations
    int Niter_GPU = 0;                                                  // --- Iteration counter for GPU calculations

    // --- Device temperature allocation and initialization
    float *d_T;     gpuErrchk(cudaMalloc(&d_T, N * sizeof(float)));
    gpuErrchk(cudaMemset(d_T, 0, N * sizeof(float))); 

    // --- GPU calculations
    HeatEquation1DGPU(d_T, &Niter_GPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

    // --- Transfer the GPU calculation results from device to host
    gpuErrchk(cudaMemcpy(h_T_final_device, d_T, N * sizeof(float), cudaMemcpyDeviceToHost));

    /********************/
    /* CPU CALCULATIONS */
    /********************/
    // --- Host temperature allocation and initialization
    float *h_T_final_host = (float *)malloc(N * sizeof(float));
    memset(h_T_final_host, 0, N * sizeof(float));

    int Niter_CPU = 0;

    HeatEquation1DCPU(h_T_final_host, &Niter_CPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    for (int i = 0; i < N; i++) {
        printf("Node = %i; T_host = %3.10f; T_device = %3.10f\n", i, h_T_final_host[i], h_T_final_device[i]);
        if (h_T_final_host[i] != h_T_final_device[i]) {
            printf("Error at i = %i; T_host = %f; T_device = %f\n", i, h_T_final_host[i], h_T_final_device[i]);
            return 0;
        }
    }

    printf("Test passed!\n");

    delete [] h_T_final_device;
    gpuErrchk(cudaFree(d_T));

    return 0;
}

A classical 1D example is provided by the heat equation.

Below, I'm posting a concrete, fully worked CPU/GPU example on this topic exploiting the Jacobi solution scheme. Please, note that two time-step kernels are provided, one not using shared memory and one using shared memory.

#include <stdio.h>
#include <stdlib.h>

#include <thrust\device_vector.h>

#include "Utilities.cuh"

#define BLOCKSIZE  512

/****************************/
/* CPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DCPU(float * __restrict__ h_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    float *h_DeltaT = (float *)malloc(N * sizeof(float));

    // --- Enforcing boundary condition at the left end.
    *h_T = T0;
    h_DeltaT[0] = 0.f;

    float current_max;
    do {
        // --- Internal region between the two boundaries.
        for (int i = 1; i < N - 1; i++) h_DeltaT[i] = dt * alpha * ((h_T[i - 1] + h_T[i + 1] - 2.f * h_T[i]) / (dx * dx));

        // --- Enforcing boundary condition at the right end.
        h_DeltaT[N - 1] = dt * 2.f * ((k * ((h_T[N - 2] - h_T[N - 1]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature and find the maximum DeltaT over all nodes
        current_max = h_DeltaT[0]; // --- Remember: h_DeltaT[0] = 0
        for (int i = 1; i < N; i++)
        {
            h_T[i] = h_T[i] + h_DeltaT[i]; // h_T[0] keeps
            current_max = abs(h_DeltaT[i]) > current_max ? abs(h_DeltaT[i]) : current_max;
        }

        // --- Increase iteration counter
        (*Niter)++;

    } while (*Niter < maxIterNumber && current_max > maxErr);

    delete [] h_DeltaT;
}

/**************************/
/* GPU CALCULATION KERNEL */
/**************************/
__global__ void HeatEquation1DGPU_IterationKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N) {

        // --- Internal region between the two boundaries.
        if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid]  = dt * alpha *((d_T[tid - 1] + d_T[tid + 1] - 2.f * d_T[tid]) / (dx * dx));
        // --- Enforcing boundary condition at the left end.
        if (tid == 0)                    d_DeltaT[0]    = 0.f;
        // --- Enforcing boundary condition at the right end.
        if (tid == N - 1)                d_DeltaT[tid]  = dt * 2.f * ((k * ((d_T[tid - 1] - d_T[tid]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature
        d_T[tid] = d_T[tid] + d_DeltaT[tid]; 

        d_DeltaT[tid] = abs(d_DeltaT[tid]);
    }

}

__global__ void HeatEquation1DGPU_IterationSharedKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // --- Shared memory has 0, 1, ..., BLOCKSIZE - 1, BLOCKSIZE locations, so it has BLOCKSIZE locations + 2 (left and right) halo cells.
    __shared__ float d_T_shared[BLOCKSIZE + 2];             // --- Need to know BLOCKSIZE beforehand

    if (tid < N) {

        // --- Load data from global memory to shared memory locations 1, 2, ..., BLOCKSIZE - 1
        d_T_shared[threadIdx.x + 1] = d_T[tid];                 

        // --- Left halo cell
        if ((threadIdx.x == 0) && (tid > 0)) { d_T_shared[0] = d_T[tid - 1]; }          

        // --- Right halo cell
        if ((threadIdx.x == blockDim.x - 1) && (tid < N - 1)) { d_T_shared[threadIdx.x + 2] = d_T[tid + 1]; } 

        __syncthreads();

        // --- Internal region between the two boundaries.
        if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid]  = dt * alpha *((d_T_shared[threadIdx.x] + d_T_shared[threadIdx.x + 2] - 2.f * d_T_shared[threadIdx.x + 1]) / (dx * dx));

        // --- Enforcing boundary condition at the left end.
        if (tid == 0)                    d_DeltaT[0]    = 0.f;

        // --- Enforcing boundary condition at the right end.
        if (tid == N - 1)                d_DeltaT[tid]  = dt * 2.f * ((k * ((d_T_shared[threadIdx.x] - d_T_shared[threadIdx.x + 1]) / dx) + Q_N_1) / (dx * rho * cp));

        // --- Update the temperature
        d_T[tid] = d_T[tid] + d_DeltaT[tid]; 

        d_DeltaT[tid] = abs(d_DeltaT[tid]);
    }

}

/****************************/
/* GPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DGPU(float * __restrict__ d_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho, 
                       const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    // --- Absolute values of DeltaT
    float *d_DeltaT;    gpuErrchk(cudaMalloc(&d_DeltaT, N * sizeof(float)));

    // --- Enforcing boundary condition at the left end.
    gpuErrchk(cudaMemcpy(d_T, &T0, sizeof(float), cudaMemcpyHostToDevice));

    float current_max = 0.f;
    do {
        //HeatEquation1DGPU_IterationKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
        HeatEquation1DGPU_IterationSharedKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

        thrust::device_ptr<float> d = thrust::device_pointer_cast(d_DeltaT);  
        current_max = thrust::reduce(d, d + N, current_max, thrust::maximum<float>());

        // --- Increase iteration counter
        (*Niter)++;

    } while (*Niter < maxIterNumber && current_max > maxErr);

    gpuErrchk(cudaFree(d_DeltaT));
}

/********/
/* MAIN */
/********/
int main()
{
    // --- See https://en.wikipedia.org/wiki/Thermal_diffusivity

    // --- Parameters of the problem
    const float k           = 0.19f;                    // --- Thermal conductivity [W / (m * K)]
    const float rho         = 930.f;                    // --- Density [kg / m^3]
    const float cp          = 1340.f;                   // --- Specific heat capacity [J / (kg * K)]
    const float alpha       = k / (rho * cp);           // --- Thermal diffusivity [m^2 / s]
    const float length      = 1.6f;                     // --- Total length of the domain [m]
    const int N             = 64 * BLOCKSIZE;           // --- Number of grid points
    const float dx          = (length / (float)(N - 1));// --- Discretization step [m]
    const float dt          = (float)(dx * dx / (4.f * alpha));
                                                        // --- Time step [s]
    const float T0          = 0.f;                      // --- Temperature at the first end of the domain [C]
    const float Q_N_1       = 10.f;                     // --- Heat flux at the second end of the domain [W / m^2]
    const float maxErr      = 1.0e-5f;                  // --- Maximum admitted DeltaT
    const int maxIterNumber = 10.0 / dt;                // --- Number of overall time steps

    /********************/
    /* GPU CALCULATIONS */
    /********************/
    float *h_T_final_device = (float *)malloc(N * sizeof(float));       // --- Final "host-side" result of GPU calculations
    int Niter_GPU = 0;                                                  // --- Iteration counter for GPU calculations

    // --- Device temperature allocation and initialization
    float *d_T;     gpuErrchk(cudaMalloc(&d_T, N * sizeof(float)));
    gpuErrchk(cudaMemset(d_T, 0, N * sizeof(float))); 

    // --- GPU calculations
    HeatEquation1DGPU(d_T, &Niter_GPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

    // --- Transfer the GPU calculation results from device to host
    gpuErrchk(cudaMemcpy(h_T_final_device, d_T, N * sizeof(float), cudaMemcpyDeviceToHost));

    /********************/
    /* CPU CALCULATIONS */
    /********************/
    // --- Host temperature allocation and initialization
    float *h_T_final_host = (float *)malloc(N * sizeof(float));
    memset(h_T_final_host, 0, N * sizeof(float));

    int Niter_CPU = 0;

    HeatEquation1DCPU(h_T_final_host, &Niter_CPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    for (int i = 0; i < N; i++) {
        printf("Node = %i; T_host = %3.10f; T_device = %3.10f\n", i, h_T_final_host[i], h_T_final_device[i]);
        if (h_T_final_host[i] != h_T_final_device[i]) {
            printf("Error at i = %i; T_host = %f; T_device = %f\n", i, h_T_final_host[i], h_T_final_device[i]);
            return 0;
        }
    }

    printf("Test passed!\n");

    delete [] h_T_final_device;
    gpuErrchk(cudaFree(d_T));

    return 0;
}

回复收藏 0 原文