I'm trying to compile a kernel written in C++ for OpenCL, using Clang with -cl-std=clc++2021.
The clang compilation of the kernel runs fine:

clang --target=spirv32 -Xclang -no-opaque-pointers -cl-std=clc++2021 ERK.cpp -o ERK.spv

My kernel code implements an explicit Runge-Kutta 4 integrator scheme:

/// Adds vector `b` element-wise into vector `a`, i.e. a[i] += b[i].
///
/// @param a   Accumulator; its first Nx elements are updated in place.
/// @param b   Addend; its first Nx elements are read.
/// @param Nx  Number of elements to process; nothing happens when Nx <= 0.
void addVec(float* a, const float* b, const int Nx)
{
    for (int i = 0; i < Nx; i++) {
        a[i] += b[i];
    }
}

/// Explicit Runge-Kutta integrator with a compile-time number of stages.
///
/// @tparam Nstage_ERK  Number of stages of the scheme.
/// @tparam f_ODE       Right-hand side provider. Must expose a compile-time
///                     constant `NX` (state dimension) and a static function
///                     `solve(t, x, xdot, P)` that writes NX derivatives to xdot.
///
/// The Butcher tableau is referenced through raw pointers that are stored, not
/// copied, so the arrays must stay alive for as long as the object is used.
template <int Nstage_ERK, class f_ODE>
struct impl_ERK {

    /// State dimension, taken from the ODE definition.
    constexpr static int NX = f_ODE::NX;

    /// @param _A  Stage coefficients, read as _A[i * Nstage_ERK + j] (row i, column j).
    /// @param _b  Stage weights, Nstage_ERK entries.
    /// @param _c  Stage time offsets (as fractions of dt), Nstage_ERK entries.
    impl_ERK(const float* _A, const float* _b, const float* _c): p_A(_A), p_b(_b), p_c(_c){}

    /// Advances the state by one step of size dt.
    ///
    /// @param xk    State at time t (NX values).
    /// @param xk_1  Output: state at time t + dt (NX values). Fully overwritten.
    /// @param t     Time at the start of the step.
    /// @param dt    Step size.
    /// @param P     Parameter array forwarded unchanged to f_ODE::solve.
    void solve(constant float* xk, float* xk_1, float t, float dt, const float* P)
    {
        // Stage derivatives, stage i occupies K[i * NX] .. K[i * NX + NX - 1].
        float K[Nstage_ERK * NX];
        float xk_stage[NX];

        // First stage is evaluated at the unmodified state. The result starts
        // out as a copy of xk so the weighted stage sums can be added to it.
        for (int n = 0; n < NX; n++) {
            xk_stage[n] = xk[n];
            xk_1[n] = xk[n];
        }
        f_ODE::solve(t, xk_stage, K, P);
        for (int n = 0; n < NX; n++) {
            xk_1[n] += dt * p_b[0] * K[n];
        }

        for (int i = 1; i < Nstage_ERK; i++) {
            float* k_i = &K[i * NX];
            const float c_i = p_c[i];

            // Stage state: x_n + dt * sum_{j<i} A[i][j] * K_j[n], built per
            // state component n from all previously computed stages j.
            for (int n = 0; n < NX; n++) {
                float ak_sum = 0.0f;
                for (int j = 0; j < i; j++) {
                    ak_sum += p_A[i * Nstage_ERK + j] * K[j * NX + n];
                }
                xk_stage[n] = xk[n] + dt * ak_sum;
            }

            f_ODE::solve(t + c_i * dt, xk_stage, k_i, P);

            for (int n = 0; n < NX; n++) {
                xk_1[n] += dt * p_b[i] * k_i[n];
            }
        }
    }

    const float* p_A; ///< Stage coefficient matrix (not owned).
    const float* p_b; ///< Stage weights (not owned).
    const float* p_c; ///< Stage time offsets (not owned).
};

/// Four-stage explicit Runge-Kutta integrator: impl_ERK fixed to 4 stages.
///
/// @tparam f_ODE  Right-hand side provider, passed through to impl_ERK.
template <class f_ODE>
struct ERK4 : public impl_ERK<4, f_ODE>
{
    /// Forwards the tableau pointers (_A, _b, _c) unchanged to impl_ERK<4, f_ODE>.
    ERK4(const float* _A, const float* _b, const float* _c) : impl_ERK<4, f_ODE>(_A, _b, _c) {}
};

/// Right-hand side of a three-state ODE with three parameters.
/// NOTE(review): the equations have the shape of an SIR-type compartment
/// model (x[0] -> x[1] -> x[2]) — confirm the intended interpretation.
struct f_ODE_1
{
    constexpr static int NX = 3; ///< Number of state variables.
    constexpr static int NP = 3; ///< Number of entries read from P.

    /// Evaluates the time derivative of the state.
    ///
    /// @param t     Time; not used by this right-hand side.
    /// @param xk    Current state (NX values).
    /// @param xdot  Output: derivative of each state variable (NX values).
    /// @param P     Parameters: P[0] = alpha, P[1] = beta, P[2] = N_pop.
    static void solve(const float t, const float* xk, float* xdot, const float* P)
    {
        (void)t; // the system is autonomous

        const float alpha = P[0];
        const float beta = P[1];
        const float N_pop = P[2];

        xdot[0] = -beta*xk[0]*xk[1]/N_pop;
        xdot[1] = beta*xk[0]*xk[1]/N_pop - alpha*xk[1];
        xdot[2] = alpha*xk[1];
    }
};

// Kernel entry point: integrates f_ODE_1 over a single step of size dt with a
// four-stage explicit Runge-Kutta scheme and stores the new state in x1.
//   x0: initial state, f_ODE_1::NX floats in constant memory.
//   x1: output state, f_ODE_1::NX floats in global memory.
__kernel void compute(constant float* x0, global float* x1)
{
    // Butcher tableau of the classical RK4 scheme (strictly lower triangular,
    // matching the weights b and the nodes c below).
    const float A[4*4] = {.0f, .0f, .0f, .0f,
                          .5f, .0f, .0f, .0f,
                          .0f, .5f, .0f, .0f,
                          .0f, .0f, 1.f, .0f};
    const float b[4] = {1.f/6, 1.f/3, 1.f/3, 1.f/6};
    const float c[4] = {.0f,.5f,.5f,1.f};
    const float dt = .5f;
    const float t = .0f;
    // Single-precision literals avoid relying on double support on the device.
    const float R0 = 1.2f;
    const float alpha = .9f;
    const float beta = R0*alpha;
    const float N_pop = 1e6f;
    const float P[f_ODE_1::NP] = {alpha, beta, N_pop};
    // Zero-initialised so the buffer never holds indeterminate values.
    float res[f_ODE_1::NX] = {.0f, .0f, .0f};
    ERK4<f_ODE_1> integrator(A, b, c);
    integrator.solve(x0, res, t, dt, P);

    // Publish the result; without this the kernel has no observable output.
    for (int j = 0; j < f_ODE_1::NX; j++) {
        x1[j] = res[j];
    }
}

Creating a program with clCreateProgramWithIL(.) and building it works fine; however, I'm not able to create any kernels using clCreateKernelsInProgram(.).

    // Create the program object from the intermediate-language (SPIR-V) bytes
    // held in programBinary.
    clInstance.program = clCreateProgramWithIL(clInstance.context, (const void*) programBinary.data(), sizeof(char)*programBinary.length(), &err);

    assert(err == CL_SUCCESS);

    std::string build_options = "-I " + cl_generator_dir + " -I " + ERK_Kernel_dir;
    /*Step 6: Build program. */
    int status = clBuildProgram(clInstance.program, 1, clInstance.device_ids.data(), build_options.c_str(), NULL, NULL);
    if (status == CL_BUILD_PROGRAM_FAILURE)
    {
        // Determine the size of the log
        size_t log_size = 0;
        clGetProgramBuildInfo(clInstance.program, clInstance.device_ids[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        // Allocate memory for the log (+1 so the buffer can always be terminated)
        char *log = (char *)malloc(log_size + 1);
        if (log != NULL)
        {
            // Get the log
            clGetProgramBuildInfo(clInstance.program, clInstance.device_ids[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
            log[log_size] = '\0';

            // Print the log
            printf("%s\n", log);

            free(log);
        }
    }
    // Any build error, not just CL_BUILD_PROGRAM_FAILURE, leaves the program
    // unusable, so stop here in the same way as for the other OpenCL calls.
    assert(status == CL_SUCCESS);

    // Host-side initial state and a sentinel-filled result array.
    float x0[3] = {1.0,2.0,3.0};
    float x_res[3] = {-10,-10,-10};

    size_t inputBufferSize = sizeof(float)*3;
    size_t outputBufferSize = sizeof(float)*3;

    cl_mem inputBuffer = clCreateBuffer(clInstance.context, CL_MEM_READ_ONLY, inputBufferSize, NULL, &err);
    assert(err == CL_SUCCESS);

    cl_mem outputBuffer = clCreateBuffer(clInstance.context, CL_MEM_WRITE_ONLY, outputBufferSize, NULL, &err);
    assert(err == CL_SUCCESS);

    // Blocking write: x0 has been copied to the device when the call returns.
    err = clEnqueueWriteBuffer(clInstance.commandQueue, inputBuffer, CL_TRUE, 0, inputBufferSize, x0, 0, NULL, NULL);

    assert(err == CL_SUCCESS);

    // cl_kernel kernel = clCreateKernel(clInstance.program, "_ZNU3AS48impl_ERKILi4E7f_ODE_1E5solveEPU3AS2fPU3AS4fffPU3AS4Kf", &err);
    // assert(err == CL_SUCCESS);
    cl_kernel kernel = NULL;
    cl_uint num_kernels_ret = 0;
    // First call only queries how many kernels the program contains.
    err = clCreateKernelsInProgram(clInstance.program, 0, NULL, &num_kernels_ret);
    assert(err == CL_SUCCESS);

    // Only a single cl_kernel handle is available below, so any other count
    // would either leave it unset or write past it.
    assert(num_kernels_ret == 1);

    err = clCreateKernelsInProgram(clInstance.program, num_kernels_ret, &kernel, NULL);
    assert(err == CL_SUCCESS);

Why isn't my kernel function __kernel void compute(.) recognized by OpenCL?


err = clCreateKernelsInProgram(clInstance.program, 0, NULL, &num_kernels_ret);

err = clCreateKernelsInProgram(clInstance.program, num_kernels_ret, &kernel, NULL);


cl_kernel kernel = clCreateKernel(clInstance.program, "compute", &err);

results in error code CL_INVALID_KERNEL_NAME

The Khronos registry mentions cl_khr_spir, which is used to add support for creating OpenCL program objects from SPIRV. Is this necessary when compiling from clang with -cl-std=clc++2021?

九厘米的零° 2025-02-06 05:24:38



err = clCreateKernelsInProgram(clInstance.program, 0, NULL, &num_kernels_ret);

err = clCreateKernelsInProgram(clInstance.program, num_kernels_ret, &kernel, NULL);


cl_kernel kernel = clCreateKernel(clInstance.program, "compute", &err);

The problem turned out to be unrelated to kernel creation; it was caused by the file read terminating early, so the SPIR-V file was not loaded completely.

With ERK.spv loaded properly, OpenCL has no issues with kernel creation, using either of the following approaches:

err = clCreateKernelsInProgram(clInstance.program, 0, NULL, &num_kernels_ret);

err = clCreateKernelsInProgram(clInstance.program, num_kernels_ret, &kernel, NULL);


cl_kernel kernel = clCreateKernel(clInstance.program, "compute", &err);

