Why won't my OpenCL kernel execute with specific parameters?
I have an OpenCL kernel that runs in JOCL and passes all of my JUnit tests. I ported the code to C++ so that I could profile the kernel under the same conditions. The driver works fine in every case except one. It runs perfectly well in JOCL, so I believe something in my C++ code is wrong. My code is below, and I have audited it to death. If anyone can help me spot what is wrong, I'd appreciate it.
The driver code works fine with args 1 and 2 as 8192 and arg 3 as 512. It also works fine with args 1 and 2 as 512 and arg 3 as 8192. Arg 4 is always just 1, which sets the kernel to real numbers. When I set args 1 and 2 to 262144 and arg 3 to 16, it executes, no errors are reported, and there is no seg fault, but the kernel never changes the data. Note that arg 1 * arg 3 is equal to 2^22 in all of the cases above, so I believe I am allocating the same number of floats every time. I am stumped, and I can't get OpenCL to tell me what is wrong :(
void HelperFunctions::callKernel(int windowSize, int primitivesPerDataFrame, int nInFramesThisCall, int realOrComplex)
{
// OpenCL Vars
cl_platform_id platform; // OpenCL platform
cl_device_id device; // OpenCL device
cl_context gpuContext; // OpenCL context
cl_command_queue commandQueue; // OpenCL command queue
cl_program clProgram; // OpenCL program
cl_kernel clkernel; // OpenCL kernel
void *dataHostBuffer; // Host buffer
void *windowDataHostBuffer; // Host buffer
cl_mem inData; // OpenCL device buffer
cl_mem windowData; // OpenCL device source buffer
size_t szKernelLength; // Byte size of kernel code
cl_int errCode; // Error code var
long gridX = 256;
long gridY = 16384;
long gridZ = 1;
size_t global_work_size[] = {gridX, gridY, gridZ};
size_t local_work_size[] = {gridX, 1, 1};
const char* cSourceCL = NULL; // Buffer to hold source for compilation
// Allocate and populate the host data buffers
// (generateRampData and blackman allocate and return the buffers themselves,
// so a separate malloc here would just be overwritten and leaked)
dataHostBuffer = generateRampData(primitivesPerDataFrame * nInFramesThisCall);
windowDataHostBuffer = blackman(windowSize);
//Get an OpenCL platform
errCode = clGetPlatformIDs(1, &platform, NULL);
cout << "Error Code: " << errCode << endl;
//Get the devices
errCode = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
cout << "Error Code: " << errCode << endl;
//Create the context
gpuContext = clCreateContext(0, 1, &device, NULL, NULL, &errCode);
cout << "Error Code: " << errCode << endl;
// Create a command-queue
commandQueue = clCreateCommandQueue(gpuContext, device, 0, &errCode);
cout << "Error Code: " << errCode << endl;
// Read the OpenCL kernel in from source file
cSourceCL = oclLoadProgSource("/home/djkasht/workspaceBlueprint/bp/bp-trunk/bundles/CopperShark/src/coppershark/dsp/blocks/opencl/dsp/window/Window.cl", "", &szKernelLength);
szKernelLength = strlen(cSourceCL);
// Create the program
clProgram = clCreateProgramWithSource(gpuContext, 1, (const char **)&cSourceCL, &szKernelLength, &errCode);
cout << "Error Code: " << errCode << endl;
// Build the program
errCode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
cout << "Error Code: " << errCode << endl;
size_t log_size = 1000000 * sizeof(char);
char build_log[log_size];
size_t len;
errCode = clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, &len);
cout << build_log << endl;
// Create the kernel
clkernel = clCreateKernel(clProgram, "window", &errCode);
cout << "Error Code: " << errCode << endl;
// Allocate the OpenCL buffer memory objects
inData = clCreateBuffer(gpuContext, CL_MEM_READ_WRITE, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, NULL, &errCode);
cout << "Error Code: " << errCode << endl;
windowData = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY, sizeof(cl_float) * windowSize, NULL, &errCode);
cout << "Error Code: " << errCode << endl;
// Set the Argument values
errCode = clSetKernelArg(clkernel, 0, sizeof(cl_mem), (void*)&inData);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 1, sizeof(cl_mem), (void*)&windowData);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 2, sizeof(cl_int), (void*)&windowSize);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 3, sizeof(cl_int), (void*)&primitivesPerDataFrame);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 4, sizeof(cl_int), (void*)&nInFramesThisCall);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 5, sizeof(cl_int), (void*)&realOrComplex);
cout << "Error Code: " << errCode << endl;
// Asynchronous write of data to GPU device
errCode = clEnqueueWriteBuffer(commandQueue, inData, CL_FALSE, 0, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, dataHostBuffer, 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;
// Asynchronous write of the window coefficients to the GPU device
errCode = clEnqueueWriteBuffer(commandQueue, windowData, CL_FALSE, 0, sizeof(cl_float) * windowSize, windowDataHostBuffer, 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;
errCode = clEnqueueNDRangeKernel(commandQueue, clkernel, 3, NULL, &(global_work_size[0]), &(local_work_size[0]), 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;
void* dataHostBuffer2 = (void *)malloc(sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall);
errCode = clEnqueueReadBuffer(commandQueue, inData, CL_TRUE, 0, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, dataHostBuffer2, 0, NULL, NULL);
}
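For context, the three parameter combinations described above would be driven by call sites roughly like these. The actual test harness is not part of the post, so this is only an illustrative sketch of the arguments:

// Illustrative only: the real call sites are not shown in the question.
// arg 4 (realOrComplex) is always 1, i.e. real-valued data.
HelperFunctions helpers;
helpers.callKernel(8192, 8192, 512, 1);      // works
helpers.callKernel(512, 512, 8192, 1);       // works
helpers.callKernel(262144, 262144, 16, 1);   // runs, but the data comes back unchanged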
UPDATE: I figured it out! The problem was in my kernel. I use constant memory. My Java code accounts for this and textually manipulates the kernel source so that, if my buffer size for arg 2 is greater than 16384, it changes __constant to __global. I should have known this, but I forgot...
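For anyone hitting the same wall, a minimal C++ sketch of that workaround might look like the following. The helper name and the substitution strategy are my own assumptions, not the poster's actual Java code; the idea is simply to query CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE and rewrite the kernel source before handing it to clCreateProgramWithSource (16384 floats is 64 KB, the minimum constant buffer size the OpenCL spec guarantees).

#include <CL/cl.h>
#include <string>

// Hypothetical helper: fall back from __constant to __global when the
// window buffer would exceed the device's constant buffer limit.
std::string patchConstantQualifier(std::string kernelSrc,
                                   size_t windowBytes,
                                   cl_device_id device)
{
    // Ask the device how large a __constant buffer may be.
    cl_ulong maxConstBytes = 0;
    clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                    sizeof(maxConstBytes), &maxConstBytes, NULL);

    // If the window data no longer fits, textually replace the qualifier,
    // mirroring what the Java/JOCL path already does.
    if (windowBytes > maxConstBytes) {
        const std::string from = "__constant";
        const std::string to = "__global";
        size_t pos = 0;
        while ((pos = kernelSrc.find(from, pos)) != std::string::npos) {
            kernelSrc.replace(pos, from.size(), to);
            pos += to.size();
        }
    }
    return kernelSrc;
}

Calling this on the loaded source with sizeof(cl_float) * windowSize as the second argument would make the 262144-float case silently switch to __global, which is the behaviour the JOCL version already had.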