Archives Discussions

hlee133 · ‎05-14-2012

Hi all. I just start to grab opencl and I have a problem with non-blocking operation.

I could compile and execute simple example openCL code.

After did that, I wanted to see non-blocking operation of clEnqueueReadBuffer() function.

So, I modified code a little bit.

After modification, compile is ok. However, whenever I execute program, segmentation fault happen.

Here is my code. Bold line is added or modified line.

//-----------------------------------------------------------------------------------------------------------------------

#include <stdio.h>

#include "CL/cl.h"

#define DATA_SIZE 10

/////////////////////////////////////////////////////////////////////////////////////////

const char *KernelSource =

"__kernel void hello(__global float *input, __global float *output)\n"\

"{\n"\

" size_t id = get_global_id(0);\n"\

" output[id] = input[id] * input[id];\n"\

"}\n"\

"\n";

/////////////////////////////////////////////////////////////////////////////////////////

int main(void) {

cl_context context;

cl_context_properties properties[3];

cl_kernel kernel;

cl_command_queue command_queue;

cl_program program;

cl_int err;

cl_uint num_of_platforms=0;

cl_platform_id platform_id;

cl_device_id device_id;

cl_uint num_of_devices=0;

cl_mem input, output;

size_t global;

//////////////////////////////////////////

cl_event event_queue1;

cl_uint num_events_in_waitlist1 = 1;

cl_event event_waitlist1[1] = { event_queue1 };

//////////////////////////////////////////

float inputData[DATA_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

float results[DATA_SIZE] = {0};

int i;

if (clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS) {

printf("Unable to get platform_id\n");

return 1;

}

else {

printf("[INFO]Able to get platform_id: %d\n", platform_id);

printf("[INFO]num_of_platforms : %d\n\n", num_of_platforms);

}

// try to get a supported GPU device

// When you get GPU device, use CL_DEVICE_TYPE_GPU flag.

if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {

printf("Unable to get GPU device!!\n");

// try to get a supported CPU device

// When you get GPU device, use CL_DEVICE_TYPE_CPU flag.

if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {

printf("Unable to get device!!\n");

return 1;

}

// Now setup context properties - these context properties must be terminated with 0

properties[0] = CL_CONTEXT_PLATFORM;

properties[1] = (cl_context_properties) platform_id;

properties[2] = 0; // properties is declared with array[3]. So, the last element of array must be set to 0.

// create a context with the CPU device

context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);

// Create a command queue,

command_queue = clCreateCommandQueue(context, device_id, 0, &err);

// create a program from the kernel source code

program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);

// compile the program

if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {

printf("[FATAL] Error building program\n");

return 1;

}

else {

printf("[INFO] Success to build program\n");

}

// specify which kernel from the program to execute

kernel = clCreateKernel(program, "hello", &err);

// create buffers for the input and output

input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);

output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);

// load input data into input buffer

clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputData, 0, NULL, NULL);

// set the argument list for the kernel command

clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);

clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);

global = DATA_SIZE;

// enqueue the kernel command for execution

clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event_queue1);

// copy the results from out of the output buffer

clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, num_events_in_waitlist1, event_waitlist1, NULL);

clFinish(command_queue);

// print the results

printf("\noutput: ");

for (i=0; i<DATA_SIZE; i++) {

printf("%f ", results);

}

printf("\n\n");

// cleanup - Release OpenCL resources

clReleaseMemObject(input);

clReleaseMemObject(output);

clReleaseProgram(program);

clReleaseKernel(kernel);

clReleaseCommandQueue(command_queue);

clReleaseContext(context);

printf("Program complete!! terminate.\n");

return 0;

}

//-----------------------------------------------------------------------------------------------------------------------

I tried to find which line cause segfault but I can't.

If someone have a idea for what's going on in this code. Please let me know.

My environment is Fedora10, AMD APP SDK 2.6, and Catalyst 12.4.

Thank you.

P.S. I have anther question. What's the maximum number of command queues that can execute concurrently?

gautam_himanshu · ‎05-16-2012

Yes, ofcourse it would work that way. The issue in the first version was that you were not creating the array event_waitlist1 properly. The value of the variable event_queue1 was copied in the array, while the variable itself was initialized later while using clEnqueueNDRangeKernel.

IMHO, there is no need to create any array at all. Just use event_queue1 variable. In the wait list of readBuffer function pass its address only.

View solution in original post

Wenju · ‎05-14-2012

Hi,hlee133

The variable 'event_queue1' is being used without being initialized. Try this:clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, &event_queue1).

Thank you.

hlee133 · ‎05-16-2012

Thank you for response but I your code isn't what I want.

Basically, what i want to do is clEnqueueNDRangeKernel() generate event and clEnqueueReadBuffer() wait that event.

In your code, clEnqueueReadBuffer() generate event_queue1 not wait event_queue1.

Anyway, this code is work.

clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, 1, &event_queue1, NULL)

Wenju · ‎05-16-2012

Hi hlee133,

OK,I see.Maybe you should try this:

clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event_queue1);

cl_event event_waitlist1[1] = {event_queue1};

// copy the results from out of the output buffer

clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, num_events_in_waitlist1, event_waitlist1, NULL);

You want this, right?

gautam_himanshu · ‎05-16-2012

Yes, ofcourse it would work that way. The issue in the first version was that you were not creating the array event_waitlist1 properly. The value of the variable event_queue1 was copied in the array, while the variable itself was initialized later while using clEnqueueNDRangeKernel.

IMHO, there is no need to create any array at all. Just use event_queue1 variable. In the wait list of readBuffer function pass its address only.

Archives Discussions

Problem with non-blocking clEnqueueReadBuffer operation.