OpenCL 仅读取/写入 1/4 的缓冲存储器,有时会崩溃

发布于 2024-12-07 19:27:37 字数 7773 浏览 0 评论 0原文

我对 OpenCL 有一个问题,那就是它执行整个命令队列,但它只读取 1/4 的输入,只写入 1/4 的结果。 无论迭代多少次,始终是 1/4。

而且它有时还会随机崩溃……调试时我得不到任何有用的信息,因为没有调试符号,无法知道它崩溃在哪里(只显示 0x4c4783f6 in ???? 之类的内容)。

源代码:

#include <iostream>
#include <cl/cl.h>
#include <cassert>
#include <cstring>

// OpenCL C source of the "add" kernel, passed to clCreateProgramWithSource.
//
// Each work-item handles the element at its global id: it stores a value in
// out[tid] and prints that value together with a[tid] and b[tid] through the
// Intel printf extension.
//
// The stored value is kept in a private variable and printed from there
// instead of being read back from `out`: the host creates `out` with
// CL_MEM_WRITE_ONLY, and reading a write-only buffer inside a kernel is
// undefined behaviour.
//
// The source is assembled from adjacent string literals (one per kernel line)
// so that the text handed to the OpenCL compiler keeps its line structure and
// build-log line numbers are meaningful.
const char *progsrc[] = {
    "#pragma OPENCL EXTENSION cl_intel_printf : enable\n"
    "__kernel void add(__global const int *a, __global const int *b, __global int *out)\n"
    "{\n"
    "    const int tid = get_global_id(0);\n"
    "    const int result = tid /* a[tid] + b[tid] */;\n"
    "    out[tid] = result;\n"
    "    printf(\"krnl: %d = %d + %d \\n\", result, a[tid], b[tid]);\n"
    "}\n"};

// Element count shared by the host arrays and the kernel launch: main()
// allocates this many ints per array and enqueues this many work-items.
static const int iterations = 20;

// CLCheck(status): if `status` is not CL_SUCCESS, logs the error code and the
// source line of the check to std::cerr and makes the ENCLOSING function
// return -1. It can therefore only be used inside functions returning int.
//
// The argument is evaluated exactly once and stored in a local, so passing an
// expression with side effects (e.g. the OpenCL call itself) is safe and
// operator precedence inside the argument cannot change the comparison.
#define CLCheck(a) \
do\
{\
    const cl_int clcheck_status_ = (a);\
    if(clcheck_status_ != CL_SUCCESS)\
    {\
        std::cerr << "OpenCL Error(" << clcheck_status_ << ") at " << __LINE__ << std::endl;\
        return -1;\
    }\
} while(0)

int main()
{
    cl_int err = CL_SUCCESS;

    int *aH = NULL;
    int *bH = NULL;
    int *outH = NULL;

    cl_uint platnum, devnum;
    cl_device_id dev;
    cl_platform_id plat;
    err = clGetPlatformIDs(0, 0, &platnum);
    CLCheck(err);
    cl_platform_id pfids[platnum];
    err = clGetPlatformIDs(platnum, pfids, &platnum);
    CLCheck(err);

    if(!platnum)
    {
        std::cerr << "No platform found." << std::endl;
        return -1;
    }
    else
        std::cout << platnum << " OpenCL platform(s) found.\n" << std::endl;

    for(unsigned int i = 0; i != platnum; i++)
    {
        char buf[4096];

        err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, 0, 0, &devnum);
        CLCheck(err);
        cl_device_id devids[devnum];
        err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, devnum, devids, &devnum);
        CLCheck(err);
        if(!devnum)
        {
            std::cerr << "No device found." << std::endl;
            return -1;
        }
        else
            std::cout << " " << devnum << " OpenCL device(s) found.\n" << std::endl;

        for(unsigned int i2 = 0; i2 != devnum; i2++)
        {
            char buf[1024];
            std::cout << ": \n\tName: " << buf;
            err = clGetDeviceInfo(devids[i2], CL_DEVICE_VENDOR, 1024, buf, NULL);
            CLCheck(err);
            if(!strncmp(buf, "Intel", 5))
            {
                dev = devids[0];
                plat = pfids[i];
                std::cout << "\n\tFound Intel(R) OpenCL device.";
            }
        }
    }
    cl_context_properties ctxprop[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)plat, 0};
    cl_context ctx = clCreateContext(ctxprop, 1, &dev, NULL, NULL, &err);
    CLCheck(err);

    cl_program program = clCreateProgramWithSource(ctx, 1, progsrc, NULL, &err);
    CLCheck(err);
    err = clBuildProgram(program, 1, &dev, "", NULL, NULL);
    if(err != CL_SUCCESS)
    {
        size_t bufsz;
        err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, 0, &bufsz);
        char buf[bufsz];
        err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, bufsz, buf, &bufsz);
        std::cerr << "OpenCL program building failed: " << buf << std::endl;
        return -1;
    }
    err = clUnloadCompiler();
    CLCheck(err);

    aH = new int[iterations];
    bH = new int[iterations];
    outH = new int[iterations];
    memset(outH, 0, iterations*sizeof(int));
    for(int i = 0; i != iterations; i++)
    {
        aH[i] = i;
        bH[i] = i*2;
    }

    cl_mem aCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
    cl_mem bCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
    CLCheck(err);
    cl_mem outCL = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, iterations, NULL, &err);
    CLCheck(err);

    cl_kernel krnl = clCreateKernel(program, "add", &err);
    CLCheck(err);

    err = clSetKernelArg(krnl, 0, sizeof(aCL), &aCL);
    CLCheck(err);
    err = clSetKernelArg(krnl, 1, sizeof(bCL), &bCL);
    CLCheck(err);
    err = clSetKernelArg(krnl, 2, sizeof(outCL), &outCL);
    CLCheck(err);

    cl_command_queue cmdqueue = clCreateCommandQueue(ctx, dev, 0, &err);
    cl_event evt;
    size_t global_work_size[1] = { iterations };
    err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations, aH, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmdqueue, bCL, CL_TRUE, 0, iterations, bH, 0, NULL, NULL);
    err = clEnqueueNDRangeKernel(cmdqueue, krnl, 1, NULL, global_work_size, NULL, 0, NULL, &evt);
    err = clWaitForEvents(1, &evt);
    err = clEnqueueReadBuffer(cmdqueue, outCL, CL_TRUE, 0, iterations, outH, 0, NULL, &evt);

    for(int i = 0; i != iterations; i++)
    {
        std::cout << outH[i] << std::endl;
    }

    err = clReleaseEvent(evt);
    err = clReleaseCommandQueue(cmdqueue);
    err = clReleaseKernel(krnl);
    err = clReleaseMemObject(outCL);
    err = clReleaseMemObject(bCL);
    err = clReleaseMemObject(aCL);
    err = clReleaseProgram(program);
    err = clReleaseContext(ctx);

    if(aH)
        delete aH;
    if(bH)
        delete bH;
    if(outH)
        delete outH;
    return 0;
}

输出:

2 OpenCL platform(s) found.

Platform 0 :
        Name: NVIDIA CUDA
        Vendor: NVIDIA Corporation
        Profile: FULL_PROFILE
        Version: OpenCL 1.1 CUDA 4.0.1
        Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll

 1 OpenCL device(s) found.

  Device 0:
        Name: GeForce GT 425M
        Vendor: NVIDIA Corporation
        Profile: FULL_PROFILE
        Driver version: 280.26
        OpenCL version: OpenCL C 1.1
        Version: OpenCL 1.1 CUDA
        Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll  cl_khr_g
lobal_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32
_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64

Platform 1 :
        Name: Intel(R) OpenCL
        Vendor: Intel(R) Corporation
        Profile: FULL_PROFILE
        Version: OpenCL 1.1
        Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
_intel_immediate_execution cl_khr_gl_sharing cl_khr_icd

 1 OpenCL device(s) found.

  Device 0:
        Name: Intel(R) Core(TM) i3 CPU       M 370  @ 2.40GHz
        Found Intel(R) OpenCL device.
        Vendor: Intel(R) Corporation
        Profile: FULL_PROFILE
        Driver version: 1.1
        OpenCL version: OpenCL C 1.1
        Version: OpenCL 1.1 (Build 15293.6650)
        Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
_intel_immediate_execution cl_khr_gl_sharing

krnl: 0 = 0 + 0
krnl: 1 = 1 + 2
krnl: 2 = 2 + 4
krnl: 3 = 3 + 6
krnl: 4 = 4 + 8
krnl: 5 = 0 + 0
krnl: 6 = 0 + 0
krnl: 7 = 0 + 0
krnl: 16 = 0 + 492859489
krnl: 17 = 0 + -1042621749
krnl: 18 = 0 + 1310105771
krnl: 19 = 0 + 134230852
krnl: 8 = 0 + 0
krnl: 9 = 0 + 0
krnl: 10 = 0 + -1094462526
krnl: 11 = 0 + -1094462526
krnl: 12 = 0 + -1230120245
krnl: 13 = 0 + 500723958
krnl: 14 = 0 + 530164160
krnl: 15 = 0 + 492859489
0
1
2
3
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

谢谢:)

I have a problem with OpenCL: it executes the entire command queue, but it reads only 1/4 of the input and writes only 1/4 of the result.
No matter how many iterations, always 1/4.

It also sometimes crashes randomly. Debugging gives me no information about where it crashes, since there are no debug symbols (0x4c4783f6 in ????, etc.).

Source code:

#include <iostream>
#include <cl/cl.h>
#include <cassert>
#include <cstring>

// OpenCL C source of the "add" kernel, passed to clCreateProgramWithSource.
//
// Each work-item handles the element at its global id: it stores a value in
// out[tid] and prints that value together with a[tid] and b[tid] through the
// Intel printf extension.
//
// The stored value is kept in a private variable and printed from there
// instead of being read back from `out`: the host creates `out` with
// CL_MEM_WRITE_ONLY, and reading a write-only buffer inside a kernel is
// undefined behaviour.
//
// The source is assembled from adjacent string literals (one per kernel line)
// so that the text handed to the OpenCL compiler keeps its line structure and
// build-log line numbers are meaningful.
const char *progsrc[] = {
    "#pragma OPENCL EXTENSION cl_intel_printf : enable\n"
    "__kernel void add(__global const int *a, __global const int *b, __global int *out)\n"
    "{\n"
    "    const int tid = get_global_id(0);\n"
    "    const int result = tid /* a[tid] + b[tid] */;\n"
    "    out[tid] = result;\n"
    "    printf(\"krnl: %d = %d + %d \\n\", result, a[tid], b[tid]);\n"
    "}\n"};

// Element count shared by the host arrays and the kernel launch: main()
// allocates this many ints per array and enqueues this many work-items.
static const int iterations = 20;

// CLCheck(status): if `status` is not CL_SUCCESS, logs the error code and the
// source line of the check to std::cerr and makes the ENCLOSING function
// return -1. It can therefore only be used inside functions returning int.
//
// The argument is evaluated exactly once and stored in a local, so passing an
// expression with side effects (e.g. the OpenCL call itself) is safe and
// operator precedence inside the argument cannot change the comparison.
#define CLCheck(a) \
do\
{\
    const cl_int clcheck_status_ = (a);\
    if(clcheck_status_ != CL_SUCCESS)\
    {\
        std::cerr << "OpenCL Error(" << clcheck_status_ << ") at " << __LINE__ << std::endl;\
        return -1;\
    }\
} while(0)

int main()
{
    cl_int err = CL_SUCCESS;

    int *aH = NULL;
    int *bH = NULL;
    int *outH = NULL;

    cl_uint platnum, devnum;
    cl_device_id dev;
    cl_platform_id plat;
    err = clGetPlatformIDs(0, 0, &platnum);
    CLCheck(err);
    cl_platform_id pfids[platnum];
    err = clGetPlatformIDs(platnum, pfids, &platnum);
    CLCheck(err);

    if(!platnum)
    {
        std::cerr << "No platform found." << std::endl;
        return -1;
    }
    else
        std::cout << platnum << " OpenCL platform(s) found.\n" << std::endl;

    for(unsigned int i = 0; i != platnum; i++)
    {
        char buf[4096];

        err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, 0, 0, &devnum);
        CLCheck(err);
        cl_device_id devids[devnum];
        err = clGetDeviceIDs(pfids[i], CL_DEVICE_TYPE_ALL, devnum, devids, &devnum);
        CLCheck(err);
        if(!devnum)
        {
            std::cerr << "No device found." << std::endl;
            return -1;
        }
        else
            std::cout << " " << devnum << " OpenCL device(s) found.\n" << std::endl;

        for(unsigned int i2 = 0; i2 != devnum; i2++)
        {
            char buf[1024];
            std::cout << ": \n\tName: " << buf;
            err = clGetDeviceInfo(devids[i2], CL_DEVICE_VENDOR, 1024, buf, NULL);
            CLCheck(err);
            if(!strncmp(buf, "Intel", 5))
            {
                dev = devids[0];
                plat = pfids[i];
                std::cout << "\n\tFound Intel(R) OpenCL device.";
            }
        }
    }
    cl_context_properties ctxprop[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)plat, 0};
    cl_context ctx = clCreateContext(ctxprop, 1, &dev, NULL, NULL, &err);
    CLCheck(err);

    cl_program program = clCreateProgramWithSource(ctx, 1, progsrc, NULL, &err);
    CLCheck(err);
    err = clBuildProgram(program, 1, &dev, "", NULL, NULL);
    if(err != CL_SUCCESS)
    {
        size_t bufsz;
        err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, 0, &bufsz);
        char buf[bufsz];
        err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, bufsz, buf, &bufsz);
        std::cerr << "OpenCL program building failed: " << buf << std::endl;
        return -1;
    }
    err = clUnloadCompiler();
    CLCheck(err);

    aH = new int[iterations];
    bH = new int[iterations];
    outH = new int[iterations];
    memset(outH, 0, iterations*sizeof(int));
    for(int i = 0; i != iterations; i++)
    {
        aH[i] = i;
        bH[i] = i*2;
    }

    cl_mem aCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
    cl_mem bCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
    CLCheck(err);
    cl_mem outCL = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, iterations, NULL, &err);
    CLCheck(err);

    cl_kernel krnl = clCreateKernel(program, "add", &err);
    CLCheck(err);

    err = clSetKernelArg(krnl, 0, sizeof(aCL), &aCL);
    CLCheck(err);
    err = clSetKernelArg(krnl, 1, sizeof(bCL), &bCL);
    CLCheck(err);
    err = clSetKernelArg(krnl, 2, sizeof(outCL), &outCL);
    CLCheck(err);

    cl_command_queue cmdqueue = clCreateCommandQueue(ctx, dev, 0, &err);
    cl_event evt;
    size_t global_work_size[1] = { iterations };
    err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations, aH, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmdqueue, bCL, CL_TRUE, 0, iterations, bH, 0, NULL, NULL);
    err = clEnqueueNDRangeKernel(cmdqueue, krnl, 1, NULL, global_work_size, NULL, 0, NULL, &evt);
    err = clWaitForEvents(1, &evt);
    err = clEnqueueReadBuffer(cmdqueue, outCL, CL_TRUE, 0, iterations, outH, 0, NULL, &evt);

    for(int i = 0; i != iterations; i++)
    {
        std::cout << outH[i] << std::endl;
    }

    err = clReleaseEvent(evt);
    err = clReleaseCommandQueue(cmdqueue);
    err = clReleaseKernel(krnl);
    err = clReleaseMemObject(outCL);
    err = clReleaseMemObject(bCL);
    err = clReleaseMemObject(aCL);
    err = clReleaseProgram(program);
    err = clReleaseContext(ctx);

    if(aH)
        delete aH;
    if(bH)
        delete bH;
    if(outH)
        delete outH;
    return 0;
}

output:

2 OpenCL platform(s) found.

Platform 0 :
        Name: NVIDIA CUDA
        Vendor: NVIDIA Corporation
        Profile: FULL_PROFILE
        Version: OpenCL 1.1 CUDA 4.0.1
        Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll

 1 OpenCL device(s) found.

  Device 0:
        Name: GeForce GT 425M
        Vendor: NVIDIA Corporation
        Profile: FULL_PROFILE
        Driver version: 280.26
        OpenCL version: OpenCL C 1.1
        Version: OpenCL 1.1 CUDA
        Extensions: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing c
l_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing c
l_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll  cl_khr_g
lobal_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32
_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64

Platform 1 :
        Name: Intel(R) OpenCL
        Vendor: Intel(R) Corporation
        Profile: FULL_PROFILE
        Version: OpenCL 1.1
        Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
_intel_immediate_execution cl_khr_gl_sharing cl_khr_icd

 1 OpenCL device(s) found.

  Device 0:
        Name: Intel(R) Core(TM) i3 CPU       M 370  @ 2.40GHz
        Found Intel(R) OpenCL device.
        Vendor: Intel(R) Corporation
        Profile: FULL_PROFILE
        Driver version: 1.1
        OpenCL version: OpenCL C 1.1
        Version: OpenCL 1.1 (Build 15293.6650)
        Extensions: cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_i
nt32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extende
d_atomics cl_khr_byte_addressable_store cl_intel_printf cl_ext_device_fission cl
_intel_immediate_execution cl_khr_gl_sharing

krnl: 0 = 0 + 0
krnl: 1 = 1 + 2
krnl: 2 = 2 + 4
krnl: 3 = 3 + 6
krnl: 4 = 4 + 8
krnl: 5 = 0 + 0
krnl: 6 = 0 + 0
krnl: 7 = 0 + 0
krnl: 16 = 0 + 492859489
krnl: 17 = 0 + -1042621749
krnl: 18 = 0 + 1310105771
krnl: 19 = 0 + 134230852
krnl: 8 = 0 + 0
krnl: 9 = 0 + 0
krnl: 10 = 0 + -1094462526
krnl: 11 = 0 + -1094462526
krnl: 12 = 0 + -1230120245
krnl: 13 = 0 + 500723958
krnl: 14 = 0 + 530164160
krnl: 15 = 0 + 492859489
0
1
2
3
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

Thanks :)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

命硬 2024-12-14 19:27:38

我不熟悉 openCL,但我认为您在这里缺少一些 sizeof

err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations, aH, 0, NULL, NULL);

可能应该是:

err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations * sizeof(int), aH, 0, NULL, NULL);

并且同样适用于以下类似的代码。

编辑:

这是您可能错过了一些 sizeof() 的另一个地方:

cl_mem aCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
cl_mem bCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
CLCheck(err);
cl_mem outCL = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, iterations, NULL, &err);
CLCheck(err);

I'm not familiar with openCL, but I think you're missing a few sizeof's here:

err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations, aH, 0, NULL, NULL);

should probably be:

err = clEnqueueWriteBuffer(cmdqueue, aCL, CL_TRUE, 0, iterations * sizeof(int), aH, 0, NULL, NULL);

The same applies to the similar code following this.

EDIT:

And here's another place you may have missed a few sizeof()s:

cl_mem aCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
cl_mem bCL = clCreateBuffer(ctx, CL_MEM_READ_ONLY, iterations, NULL, &err);
CLCheck(err);
cl_mem outCL = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, iterations, NULL, &err);
CLCheck(err);
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文