Can I call CUDA runtime functions from C++ code that is not compiled by nvcc?

Posted 2024-09-24 21:22:58


Is there any way I can call CUDA runtime function calls such as

cudaMemcpy(...);

in a .cpp file, compiled with a regular C++ compiler?


3 Answers

嘿咻 2024-10-01 21:22:58


EDIT: The example originally linked here can no longer be found, but most of it is reproduced below.

The caller, written in C (but it could just as well be C++):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>   /* not strictly needed: the caller makes no CUDA calls itself */

/* Wrapper defined in the .cu file. If this file is compiled as C rather
   than C++, declare and define the wrapper with extern "C" linkage on the
   CUDA side so the symbol names match at link time. */
extern void kernel_wrapper(int *a, int *b);

int main(int argc, char *argv[])
{
   int a = 2;
   int b = 3;

   kernel_wrapper(&a, &b);

   printf("a = %d, b = %d\n", a, b);  /* expect a = 12, b = 6 */

   return 0;
}

The callee (CUDA, compiled with nvcc):

__global__ void kernel(int *a, int *b)
{
   int tx = threadIdx.x;

   switch (tx)
   {
   case 0:
      *a = *a + 10;
      break;
   case 1:
      *b = *b + 3;
      break;
   default:
      break;
   }
}

void kernel_wrapper(int *a, int *b)
{
   int *d_1, *d_2;
   dim3 threads(2, 1);
   dim3 blocks(1, 1);

   cudaMalloc((void **)&d_1, sizeof(int));
   cudaMalloc((void **)&d_2, sizeof(int));

   cudaMemcpy(d_1, a, sizeof(int), cudaMemcpyHostToDevice);
   cudaMemcpy(d_2, b, sizeof(int), cudaMemcpyHostToDevice);

   /* Launch with the device pointers, not the host pointers a and b. */
   kernel<<<blocks, threads>>>(d_1, d_2);

   cudaMemcpy(a, d_1, sizeof(int), cudaMemcpyDeviceToHost);
   cudaMemcpy(b, d_2, sizeof(int), cudaMemcpyDeviceToHost);

   cudaFree(d_1);
   cudaFree(d_2);
}
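
For completeness, a hedged sketch of how the two pieces might be built and linked; the file names (caller.cpp, callee.cu) and the CUDA install path are assumptions, not part of the original answer:

# callee.cu holds the kernel and kernel_wrapper; nvcc compiles the device code.
nvcc -c callee.cu -o callee.o

# caller.cpp is plain host code; any regular C++ compiler will do.
g++ -c caller.cpp -o caller.o

# Link the two objects against the CUDA runtime library.
g++ caller.o callee.o -o app -L/usr/local/cuda/lib64 -lcudart
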
花辞树 2024-10-01 21:22:58


Like @PreetSangha (who provided a very useful answer), I had some issues when running it as extern ..., so I would just like to add the solution that worked for me (including templated function calls).

This is the code for my example (the full CUDA code is excluded because it is already in @PreetSangha's example); it should give the main idea of how it works. It compiles and runs on a Linux machine. I haven't tried it on Windows yet, but it should be similar. In my scenario I wanted to try int, float, and double, but more templates could be added.

// main.cpp
#include "wrapper.hpp"

int main(int argc, char *argv[]) {
    // runOnGPU expects pointers to two n*n matrices plus n,
    // so pass arrays rather than scalar literals.
    int n = 2;
    int a[] = {1, 2, 3, 4};
    int b[] = {5, 6, 7, 8};
    runOnGPU(a, b, n);
}
// cuda.cu
#include "wrapper.hpp"

template<typename T>
__global__ static void matMultCUDA(const T* a, const T* b, T* c, int n) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the write as well as the loop, so threads outside the
    // matrix do not write out of bounds.
    if (col < n && row < n) {
        T value = 0;
        for (int j = 0; j < n; j++) {
            value += a[row*n + j] * b[j*n + col];
        }
        c[row*n + col] = value;
    }
}

bool InitCUDA(bool b) {
    /* CUDA initialization (device query/selection) omitted here;
       see @PreetSangha's answer. */
    return b;
}

template<typename T>
float runOnGPU(T *a, T *b, int n) {
    /* Do CUDA things here :D -- a minimal sketch; the full version is in
       @PreetSangha's answer. */
    T *cuda_a, *cuda_b, *cuda_c;
    cudaMalloc(&cuda_a, n * n * sizeof(T));
    cudaMalloc(&cuda_b, n * n * sizeof(T));
    cudaMalloc(&cuda_c, n * n * sizeof(T));
    cudaMemcpy(cuda_a, a, n * n * sizeof(T), cudaMemcpyHostToDevice);
    cudaMemcpy(cuda_b, b, n * n * sizeof(T), cudaMemcpyHostToDevice);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((n + 15) / 16, (n + 15) / 16);
    matMultCUDA<<<dimGrid, dimBlock>>>(cuda_a, cuda_b, cuda_c, n);

    // For this sketch, the result overwrites a; the original elided this part.
    cudaMemcpy(a, cuda_c, n * n * sizeof(T), cudaMemcpyDeviceToHost);
    cudaFree(cuda_a);
    cudaFree(cuda_b);
    cudaFree(cuda_c);
    return 0.0f; // the original presumably returned a timing; elided here
}

template float runOnGPU<int>(int* a, int* b, int n);
template float runOnGPU<float>(float* a, float* b, int n);
template float runOnGPU<double>(double* a, double* b, int n);
// wrapper.hpp
#pragma once

bool InitCUDA(bool b);

template<typename T>
float runOnGPU(T *a, T *b, int n);
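
One remark on why the explicit instantiations at the bottom of cuda.cu matter: main.cpp is compiled by g++ and only sees the declaration in wrapper.hpp, so nvcc must already have emitted object code for every T the host code uses; calling runOnGPU with a type that was never instantiated fails at link time with an undefined reference. Supporting another type, say long (my example, not from the original answer), is one more instantiation line in cuda.cu:

template float runOnGPU<long>(long* a, long* b, int n);
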
# makefile
# NOTE: recipe lines below must be indented with a real TAB character.
CXX = g++
CXXFLAGS = -O3
NVCC = nvcc
NVCCFLAGS = -O3

LDFLAGS = -lcudart

OBJS = main.o cuda.o

all: program

program: $(OBJS)
        $(CXX) $(CXXFLAGS) -L/usr/local/cuda-11/lib64 cuda.o main.o -o program $(LDFLAGS)

main.o: main.cpp wrapper.hpp
        $(CXX) $(CXXFLAGS) -c main.cpp

cuda.o: cuda.cu wrapper.hpp
        $(NVCC) $(NVCCFLAGS) -c cuda.cu
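
Assuming the files above, building and running is then just a sketch like:

$ make
$ ./program
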
空心空情空意 2024-10-01 21:22:58


You can compile with

g++ -I/usr/local/cuda/include filename.cpp -o obj -L/usr/local/cuda/lib64 -lcudart

or with

nvcc filename.cu
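
For reference, a minimal sketch of what such a filename.cpp could contain (the file name and values are just an illustration): it uses only CUDA runtime-API calls and no <<<...>>> kernel-launch syntax, which is why a regular C++ compiler can build it:

// filename.cpp -- plain C++, no nvcc needed: runtime-API calls only.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int host = 42, result = 0;
    int *dev = nullptr;

    // Round-trip one int through device memory.
    cudaMalloc((void **)&dev, sizeof(int));
    cudaMemcpy(dev, &host, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(&result, dev, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev);

    printf("round trip: %d\n", result);
    return 0;
}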