4 Replies Latest reply on May 16, 2012 11:40 PM by gautam.himanshu

    Problem with non-blocking clEnqueueReadBuffer operation.

    hlee133

      Hi all. I have just started learning OpenCL and I have a problem with a non-blocking operation.

       

      I was able to compile and execute a simple example OpenCL program.

      After doing that, I wanted to try the non-blocking mode of the clEnqueueReadBuffer() function.

      So, I modified the code a little bit.

       

      After the modification, the code compiles fine. However, whenever I execute the program, a segmentation fault occurs.

      Here is my code. The bold lines are the ones I added or modified.

      //-----------------------------------------------------------------------------------------------------------------------

      #include <stdio.h>

      #include "CL/cl.h"

       

      #define DATA_SIZE 10

      /////////////////////////////////////////////////////////////////////////////////////////

      const char *KernelSource =

                          "__kernel void hello(__global float *input, __global float *output)\n"\

                          "{\n"\

                          "    size_t id = get_global_id(0);\n"\

                          "    output[id] = input[id] * input[id];\n"\

                          "}\n"\

                          "\n";

      /////////////////////////////////////////////////////////////////////////////////////////

        

      int main(void) {

                cl_context context;

                cl_context_properties properties[3];

                cl_kernel kernel;

                cl_command_queue command_queue;

                cl_program program;

                cl_int err;

                cl_uint num_of_platforms=0;

                cl_platform_id platform_id;

                cl_device_id device_id;

                cl_uint num_of_devices=0;

                cl_mem input, output;

                size_t global;

       

                //////////////////////////////////////////

                cl_event event_queue1;

                cl_uint num_events_in_waitlist1 = 1;

                cl_event event_waitlist1[1] = { event_queue1 };

                //////////////////////////////////////////

       

                float inputData[DATA_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

                float results[DATA_SIZE] = {0};

       

                int i;

       

                if (clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS) {

                          printf("Unable to get platform_id\n");

                          return 1;

                }

                else {

                          printf("[INFO]Able to get platform_id: %d\n", platform_id);

                          printf("[INFO]num_of_platforms       : %d\n\n", num_of_platforms);

                }

       

                // try to get a supported GPU device

                // When you get GPU device, use CL_DEVICE_TYPE_GPU flag.

                if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {

                          printf("Unable to get GPU device!!\n");

                          // try to get a supported CPU device

                          // When you get GPU device, use CL_DEVICE_TYPE_CPU flag.

                          if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) {

                                    printf("Unable to get device!!\n");

                                    return 1;

                          }

                }

       

                // Now setup context properties - these context properties must be terminated with 0

                properties[0] = CL_CONTEXT_PLATFORM;

                properties[1] = (cl_context_properties) platform_id;

                properties[2] = 0; // properties is declared with array[3]. So, the last element of array must be set to 0.

       

                // create a context with the CPU device

                context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);

       

                // Create a command queue,

                command_queue = clCreateCommandQueue(context, device_id, 0, &err);

       

                // create a program from the kernel source code

                program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);

       

                // compile the program

                if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {

                          printf("[FATAL] Error building program\n");

                          return 1;

                }

                else {

                          printf("[INFO] Success to build program\n");

                }

       

                // specify which kernel from the program to execute

                kernel = clCreateKernel(program, "hello", &err);

       

                // create buffers for the input and output

                input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);

                output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);

       

                // load input data into input buffer

                clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputData, 0, NULL, NULL);

       

                // set the argument list for the kernel command

                clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);

                clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);

                global = DATA_SIZE;

       

                // enqueue the kernel command for execution

                clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event_queue1);

       

                // copy the results from out of the output buffer   

           clEnqueueReadBuffer(command_queue, output, CL_FALSE, 0, sizeof(float) * DATA_SIZE, results, num_events_in_waitlist1, event_waitlist1, NULL);

       

                clFinish(command_queue);

       

                // print the results

                printf("\noutput: ");

                for (i=0; i<DATA_SIZE; i++) {

                          printf("%f ", results[i]);

                }

                printf("\n\n");

       

                // cleanup - Release OpenCL resources

                clReleaseMemObject(input);

                clReleaseMemObject(output);

                clReleaseProgram(program);

                clReleaseKernel(kernel);

                clReleaseCommandQueue(command_queue);

                clReleaseContext(context);

       

                printf("Program complete!! terminate.\n");

       

                return 0;

      }

       

      //-----------------------------------------------------------------------------------------------------------------------

      I tried to find which line causes the segfault, but I couldn't.

      If anyone has an idea of what is going on in this code, please let me know.

      My environment is Fedora10, AMD APP SDK 2.6, and Catalyst 12.4.

       

      Thank you.

       

      P.S.  I have another question.  What's the maximum number of command queues that can execute concurrently?