CUDA C Program - an Outline

The following are the minimal ingredients for a CUDA C program (a skeleton showing how they fit together follows the list):

  • The kernel. This is the function that will be executed in parallel on the GPU.
  • The main C program, which
      • allocates memory on the GPU,
      • copies data from CPU memory to GPU memory,
      • ‘launches’ the kernel (just a function call with some extra arguments), and
      • copies data from GPU memory back to CPU memory.
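How these pieces sit in a single source file is sketched below. The layout (one .cu file with the kernel defined before main) is an assumption for illustration only; the notebook cells below store the two halves in separate .hold files.

// Skeleton only: the kernel is defined (or at least declared) before main,
// and main drives the allocate / copy / launch / copy-back sequence.
#include <stdio.h>

__global__ void square_kernel(float *d_out, float *d_in) { /* kernel body */ }

int main(int argc, char **argv)
{
    /* 1. cudaMalloc device buffers
       2. cudaMemcpy host -> device
       3. square_kernel<<<blocks, threads_per_block>>>(d_out, d_in);
       4. cudaMemcpy device -> host, then cudaFree                    */
    return 0;
}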

Kernel Code

%%file kernel.hold

__global__ void square_kernel(float *d_out, float *d_in){

  int i = threadIdx.x;   // threadIdx.x uniquely identifies this thread within its block
  float f = d_in[i];     // read the input value from global memory into a register once
  d_out[i] = f*f;        // d_out is what we will copy back to the host memory

}
Overwriting kernel.hold
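
The kernel above indexes the array with threadIdx.x alone, which is enough because the launch below uses a single block (<<<1, ARRAY_SIZE>>>). With more than one block, each thread builds a global index from its block and thread coordinates; a sketch of that more general form follows (the kernel name and the extra size parameter n are additions for illustration):

// Multi-block version of the same kernel: blockIdx.x * blockDim.x + threadIdx.x
// gives every thread in the grid a unique global index, and the bounds check
// guards the (possibly partial) last block.
__global__ void square_kernel_general(float *d_out, const float *d_in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = d_in[i] * d_in[i];
}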

CPU Code

%%file main.hold

#include <stdio.h>

int main(int argc, char **argv){
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    float h_in[ARRAY_SIZE];
    float h_out[ARRAY_SIZE];

    for (int i = 0; i < ARRAY_SIZE; i++){
        h_in[i] = float(i);
    }

    float *d_in;  // These are device memory pointers
    float *d_out;

    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_out, ARRAY_BYTES);

    cudaMemcpy(d_in, h_in, ARRAY_BYTES,cudaMemcpyHostToDevice);

    square_kernel<<<1,ARRAY_SIZE>>>(d_out,d_in);

    cudaMemcpy(h_out,d_out,ARRAY_BYTES,cudaMemcpyDeviceToHost);

    for (int i = 0;i<ARRAY_SIZE;i++){
        printf("%f", h_out[i]);
        printf(((i % 4) != 3 ? "\t" : "\n"));
    }

    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}
Overwriting main.hold
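
main.hold does not check the return values of the CUDA runtime calls. A common defensive pattern, sketched here with a macro name of my own choosing (CUDA_CHECK is not part of the CUDA API), wraps each call so failures are reported immediately; kernel launches return no value, so cudaGetLastError() is queried right after the launch instead.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// Example use inside main():
//   CUDA_CHECK(cudaMalloc((void **)&d_in, ARRAY_BYTES));
//   CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));
//   square_kernel<<<1, ARRAY_SIZE>>>(d_out, d_in);
//   CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors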

Shared Memory

Shared memory is a small on-chip memory that all threads in a block can read and write; it is much faster than global memory, and __syncthreads() is used to coordinate threads that share data through it.

Lifted from: https://www.cac.cornell.edu/vw/gpu/shared_mem_exec.aspx

%%file shared_mem_ex.cu

#include <stdio.h>
#include <stdlib.h>

#define N 1024*1024
#define BLOCKSIZE 1024

__global__
void share_ary_oper(int *ary, int *ary_out)
{
    // Thread index
        int tx = threadIdx.x;
        int idx=blockDim.x*blockIdx.x + threadIdx.x;
        __shared__ int part_ary[BLOCKSIZE];

        part_ary[tx]=ary[idx];
        part_ary[tx]=part_ary[tx]*10;
        ary_out[idx]=part_ary[tx];
        __syncthreads();
}

int main(){

        int *device_array, *device_array_out;
        int *host_array, *host_array_out;
        int i, nblk;
        float k;
        size_t size = N*sizeof(int);

//Device memory
        cudaMalloc((void **)&device_array, size);
        cudaMalloc((void **)&device_array_out, size);
//Host memory
//cudaMallocHost() produces pinned memory on the host
        cudaMallocHost((void **)&host_array, size);
        cudaMallocHost((void **)&host_array_out, size);

        for(i=0;i<N;i++)
        {
                host_array[i]=i;
                host_array_out[i]=0;
        }
        cudaMemcpy(device_array, host_array, size, cudaMemcpyHostToDevice);
        cudaMemcpy(device_array_out, host_array_out, size, cudaMemcpyHostToDevice);
        nblk=N/BLOCKSIZE;
        share_ary_oper<<<nblk, BLOCKSIZE>>>(device_array, device_array_out);
        cudaMemcpy(host_array, device_array, size, cudaMemcpyDeviceToHost);
        cudaMemcpy(host_array_out, device_array_out, size, cudaMemcpyDeviceToHost);


    printf("Printing elements 10-15 of output array\n");
        for (i=N;i<N;i++)
        {
                k=host_array_out[i]-i*10;
                if(k<0.1)
                        printf("Incorrect IX %d=%.1f\n",i, k);
        }
        for (i=10;i<15;i++)
                printf("host_array_out[%d]=%d\n", i, host_array_out[i]);

        cudaFree(device_array);
        cudaFree(device_array_out);
        cudaFreeHost(host_array);
        cudaFreeHost(host_array_out);
        cudaDeviceReset();
        return EXIT_SUCCESS;
}
Overwriting shared_mem_ex.cu
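
In share_ary_oper each thread reads and writes only its own element of part_ary, so the shared array (and the trailing __syncthreads()) does not change the result; shared memory earns its keep when threads read values that other threads in the block wrote. A sketch of that pattern follows: a per-block sum with __syncthreads() separating the writes from the reads. The kernel name and the power-of-two block-size assumption are mine, not from the original example.

#define BLOCKSIZE 1024

// Sum the BLOCKSIZE elements handled by one block into out[blockIdx.x].
// Assumes blockDim.x == BLOCKSIZE and that BLOCKSIZE is a power of two.
__global__ void block_sum(const int *in, int *out, int n)
{
    __shared__ int buf[BLOCKSIZE];
    int tx  = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    buf[tx] = (idx < n) ? in[idx] : 0;
    __syncthreads();                      // every write to buf is now visible

    // Tree reduction: each step halves the number of active threads.
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tx < stride)
            buf[tx] += buf[tx + stride];  // read a value another thread wrote
        __syncthreads();
    }
    if (tx == 0)
        out[blockIdx.x] = buf[0];         // one partial sum per block
}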

Makefile

%%file Makefile

CC=nvcc
CFLAGS=-Xcompiler -Wall

shared_mem_ex.o: shared_mem_ex.cu
	$(CC) $(CFLAGS) -c shared_mem_ex.cu

clean:
	rm -f *.o
Overwriting Makefile

Compile

! make
nvcc -Xcompiler -Wall -c shared_mem_ex.cu
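
The rule above only produces an object file; linking it into an executable and running it would look something like the following (the executable name is an assumption):

! nvcc shared_mem_ex.o -o shared_mem_ex
! ./shared_mem_ex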
