Hi all. I have just started learning OpenCL and I have a problem with a non-blocking operation.
I was able to compile and run a simple example of OpenCL code.
After that, I wanted to see the non-blocking behaviour of the clEnqueueReadBuffer() function.
So, I modified the code a little.
After the modification, it still compiles fine. However, whenever I run the program, a segmentation fault occurs.
Here is my code. The lines in bold are the ones I added or modified.
//-----------------------------------------------------------------------------------------------------------------------
#include <stdio.h>
#include "CL/cl.h"
#define DATA_SIZE 10
/////////////////////////////////////////////////////////////////////////////////////////
const char *KernelSource =
"__kernel void hello(__global float *input, __global float *output)\n"\
"{\n"\
" size_t id = get_global_id(0);\n"\
" output[id] = input[id] * input[id];\n"\
"}\n"\
"\n";
/////////////////////////////////////////////////////////////////////////////////////////
int main(void) {
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global;
//////////////////////////////////////////
cl_event event_queue1;
cl_uint num_events_in_waitlist1 = 1;
cl_event event_waitlist1[1] = { event_queue1 };
//////////////////////////////////////////
float inputData[DATA_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
float results[DATA_SIZE] = {0};
int i;
if (clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
return 1;
}
else {
printf("[INFO]Able to get platform_id: %d\n", platform_id);
printf("[INFO]num_of_platforms : %d\n\n", num_of_platforms);
}
// try to get a supported GPU device
// When you get GPU device, use CL_DEVICE_TYPE_GPU flag.
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {
printf("Unable to get GPU device!!\n");
// try to get a supported CPU device
// When you get GPU device, use CL_DEVICE_TYPE_CPU flag.
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {
printf("Unable to get device!!\n");
return 1;
}
}
// Now setup context properties - these context properties must be terminated with 0
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties) platform_id;
properties[2] = 0; // properties is declared with array[3]. So, the last element of array must be set to 0.
// create a context with the CPU device
context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);
// Create a command queue,
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
printf("[FATAL] Error building program\n");
return 1;
}
else {
printf("[INFO] Success to build program\n");
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "hello", &err);
// create buffers for the input and output
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
// load input data into input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputData, 0, NULL, NULL);
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = DATA_SIZE;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event_queue1);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, num_events_in_waitlist1, event_waitlist1, NULL);
clFinish(command_queue);
// print the results
printf("\noutput: ");
for (i=0; i<DATA_SIZE; i++) {
printf("%f ", results);
}
printf("\n\n");
// cleanup - Release OpenCL resources
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
printf("Program complete!! terminate.\n");
return 0;
}
//-----------------------------------------------------------------------------------------------------------------------
I tried to find which line causes the segfault, but I couldn't.
If anyone has an idea of what is going on in this code, please let me know.
My environment is Fedora10, AMD APP SDK 2.6, and Catalyst 12.4.
Thank you.
P.S. I have another question. What is the maximum number of command queues that can execute concurrently?
Solved! Go to Solution.
Yes, of course it would work that way. The issue in the first version was that the array event_waitlist1 was not created properly: the value of the variable event_queue1 was copied into the array at declaration time, while the variable itself was only initialized later, by the call to clEnqueueNDRangeKernel.
IMHO, there is no need to create an array at all. Just use the event_queue1 variable and pass its address as the wait list of the read-buffer function.
Hi,hlee133
The variable 'event_queue1' is being used without being initialized. Try this: clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, &event_queue1).
Thank you.
Thank you for the response, but your code isn't what I want.
Basically, what I want is for clEnqueueNDRangeKernel() to generate an event and for clEnqueueReadBuffer() to wait on that event.
In your code, clEnqueueReadBuffer() generates event_queue1 rather than waiting on it.
Anyway, this code works:
clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, 1, &event_queue1, NULL)
Hi hlee133,
OK, I see. Maybe you should try this:
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event_queue1);
cl_event event_waitlist1[1] = {event_queue1};
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, num_events_in_waitlist1, event_waitlist1, NULL);
You want this, right?
Yes, of course it would work that way. The issue in the first version was that the array event_waitlist1 was not created properly: the value of the variable event_queue1 was copied into the array at declaration time, while the variable itself was only initialized later, by the call to clEnqueueNDRangeKernel.
IMHO, there is no need to create an array at all. Just use the event_queue1 variable and pass its address as the wait list of the read-buffer function.