AnsweredAssumed Answered

Problem with the radix sort code written

Question asked by shreedhar_pawar on Apr 11, 2013
Latest reply on Apr 12, 2013 by shreedhar_pawar

In the radix sort code that I have written, the second kernel is not working as want it to work...,  in the first kernel I have successfully managed to sort no.s ending with LSB 1 and LSB 0 into different arrays named g_ones and g_zero  for the first pass..(entire host code and kernel code is below) for the first pass.. In the second kernel , the elements of new_ones array(array obtained from g_ones in the host code by eliminating the invalid elements from g_ones) are not correctly transferred into the data array whereas the elements of new_zero  are correctly tranferred... I am reading both the arrays i.e. the data array and the new_ones array in my host code... the new_ones array gets printed properly but the data_array gets printed properly just for the transfer of new_zero array into it... The 2nd kernel is as follows and below that I have inserted the image of the output....

 

__kernel void radix2(__global int *data, __global int *new_ones,

          __global int *new_zero, int n)

{

          int id=get_global_id(0);

            if (id>=0 && id<n)             // n is the size of new_zero array.

                      data[id]=new_zero[id];

            else if (id>=n && id < 15)

                      data[id]=new_ones[n-id];

}

 

 

radix sort.PNG

I have tested all other parts in the code and I am sure that there isn't any problem with any of those parts... I think the problem may be with the 2nd kernel code itself ...

I am posting the entire code below...

 

HOST CODE

 

#include <iostream>

#include <fstream>

#include <sstream>

 

 

#ifdef __APPLE__

#include <OpenCL/cl.h>

#else

#include <CL/cl.h>

#endif

 

//  Create an OpenCL context on the first available platform using

//  either a GPU or CPU depending on what is available.

//

cl_context CreateContext()

{

    cl_int errNum;

    cl_uint numPlatforms;

    cl_platform_id firstPlatformId;

    cl_context context = NULL;

 

    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);

    if (errNum != CL_SUCCESS || numPlatforms <= 0)

    {

        std::cerr << "Failed to find any OpenCL platforms." << std::endl;

        return NULL;

    }

 

    cl_context_properties contextProperties[] =

    {

        CL_CONTEXT_PLATFORM,

        (cl_context_properties)firstPlatformId,

        0

    };

    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,

                                      NULL, NULL, &errNum);

    if (errNum != CL_SUCCESS)

    {

        std::cout << "Could not create GPU context, trying CPU..." << std::endl;

        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,

                                          NULL, NULL, &errNum);

        if (errNum != CL_SUCCESS)

        {

            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;

            return NULL;

        }

    }

    return context;

}

 

 

///

//  Create a command queue on the first device available on the

//  context

//

cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)

{

    cl_int errNum;

    cl_device_id *devices;

    cl_command_queue commandQueue = NULL;

    size_t deviceBufferSize = -1;

 

    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);

    if (errNum != CL_SUCCESS)

    {

        std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";

        return NULL;

    }

 

 

    if (deviceBufferSize <= 0)

    {

        std::cerr << "No devices available.";

        return NULL;

    }

 

    devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];

    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);

    if (errNum != CL_SUCCESS)

    {

        delete [] devices;

        std::cerr << "Failed to get device IDs";

        return NULL;

    }

 

    commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);

    if (commandQueue == NULL)

    {

        delete [] devices;

        std::cerr << "Failed to create commandQueue for device 0";

        return NULL;

    }

    *device = devices[0];

    delete [] devices;

    return commandQueue;

}

 

 

///

//  Create an OpenCL program from the kernel source file

//

cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)

{

    cl_int errNum;

    cl_program program;

std::ifstream kernelFile(fileName, std::ios::in);

    if (!kernelFile.is_open())

    {

        std::cerr << "Failed to open file for reading: " << fileName << std::endl;

        return NULL;

    }

    std::ostringstream oss;

    oss << kernelFile.rdbuf();

    std::string srcStdStr = oss.str();

    const char *srcStr = srcStdStr.c_str();

    program = clCreateProgramWithSource(context, 1,

                                        (const char**)&srcStr,

                                        NULL, NULL);

    if (program == NULL)

    {

        std::cerr << "Failed to create CL program from source." << std::endl;

        return NULL;

    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    if (errNum != CL_SUCCESS)

    {

        // Determine the reason for the error

        char buildLog[16384];

        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,

                              sizeof(buildLog), buildLog, NULL);

        std::cerr << "Error in kernel: " << std::endl;

        std::cerr << buildLog;

                    system("pause");

        clReleaseProgram(program);

        return NULL;

    }

    return program;

}

 

//  Cleanup any created OpenCL resources

//

void Cleanup(cl_context context, cl_command_queue commandQueue,

             cl_program program, cl_kernel kernel1,  cl_kernel kernel2 )

{

    if (commandQueue != 0)

        clReleaseCommandQueue(commandQueue);

    if (kernel1 != 0)

        clReleaseKernel(kernel1);

          if (kernel2 != 0)

        clReleaseKernel(kernel2);

    if (program != 0)

        clReleaseProgram(program);

    if (context != 0)

        clReleaseContext(context);

}

 

 

///

//      main() for radix sort

//

int main(int argc, char** argv)

{

    cl_context context = 0;

    cl_command_queue commandQueue = 0;

    cl_program program = 0;

    cl_device_id device = 0;

    cl_kernel kernel1 = 0;

          cl_kernel kernel2 = 0;

          cl_mem memObject[3] = {0,0,0};

    cl_int errNum;

 

 

    // Create an OpenCL context on first available platform

    context = CreateContext();

    if (context == NULL)

    {

        std::cerr << "Failed to create OpenCL context." << std::endl;

        return 1;

    }

 

 

    // Create a command-queue on the first device available

    // on the created context

    commandQueue = CreateCommandQueue(context, &device);

    if (commandQueue == NULL)

    {

         Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

 

 

    // Create OpenCL program from HelloWorld.cl kernel source

    program = CreateProgram(context, device, "HelloWorld.cl");

    if (program == NULL)

    {

      Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

 

 

    // Create OpenCL kernel

    kernel1 = clCreateKernel(program, "radix1", NULL);

    if (kernel1 == NULL)

    {

        std::cerr << "Failed to create kernel" << std::endl;

        Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

 

 

           kernel2 = clCreateKernel(program, "radix2", NULL);

    if (kernel2 == NULL)

    {

        std::cerr << "Failed to create kernel" << std::endl;

        Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

 

   int *data =(int*)malloc(sizeof(int)*15);

           int *g_ones=(int*)malloc(sizeof(int)*15);

            int *g_zero  =(int*)malloc(sizeof(int)*15);

 

 

 

          int j,k,q,p,cmp_val;

           for (int i = 0; i < 15; i++)

    {

        data[i]=i;

    }

          cmp_val=1;          

 

           size_t globalWorkSize[1] = { 15};

    size_t localWorkSize[1] = { 5 };

 

 

          memObject[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,

                              sizeof(int) * 15, data, NULL);

                    memObject[1] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

                              sizeof(int) * 15, NULL, NULL);

                    memObject[2] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

                              sizeof(int) * 15, NULL, NULL);

 

 

// Actual sequential loop for radix sort for the bit pass...

 

                    for(int i=0;i<4;i++)

          {

                    j=0;k=0;

                    int m,n;

                    m=0;n=0;

 

            errNum = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &memObject[0]);

                                        errNum = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &memObject[1]);

                                                            errNum = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &memObject[2]);

                                                            errNum = clSetKernelArg(kernel1, 3, sizeof(int), &cmp_val);

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error setting kernel arguments." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

 

 

 

 

                    errNum = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL,

                              globalWorkSize, localWorkSize,

                              0, NULL, NULL);

        if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error queuing kernel one for execution." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

 

 

 

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[1], CL_TRUE,

                                 0, 15 * sizeof(int), g_ones,

                                 0, NULL, NULL);

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[2], CL_TRUE,

                                 0, 15 * sizeof(int), g_zero,

                                 0, NULL, NULL);

        if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error reading result buffers." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

 

 

              for( p=0;p<15;p++)

              {

                       if (g_ones[p]!=-1)

                       {

                                 m++;

                       }

                       if(g_zero[p]!=-1)

                       {

                                 n++;

                       }

              }

    int *new_ones =(int*)malloc(sizeof(int)*m);

          int *new_zero=(int*)malloc(sizeof(int)*n);

 

 

             for( p=0;p<15;p++)

             {

                       if (g_ones[p]!=-1)

                       {

                                 new_ones[j++]=g_ones[p];

                       }

                       if(g_zero[p]!=-1)

                       {

                                 new_zero[k++]=g_zero[p];

                       }

             }

 

 

 

 

 

errNum=          clEnqueueWriteBuffer(commandQueue,           memObject[1], CL_TRUE, 0,

sizeof(int) * m,new_ones, 0, NULL, NULL);

errNum=clEnqueueWriteBuffer(commandQueue,           memObject[2], CL_TRUE, 0,

sizeof(int) * n,new_zero, 0, NULL, NULL);

          if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error writing new ones and zero buffer." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

 

 

                    errNum = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &memObject[0]);

                                        errNum = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &memObject[1]);

                                                            errNum = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &memObject[2]);

 

                                                            errNum = clSetKernelArg(kernel2, 3, sizeof(int), &n);

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error setting kernel arguments." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

                    errNum = clEnqueueNDRangeKernel(commandQueue, kernel2, 1, NULL,

                              globalWorkSize, localWorkSize,

                              0, NULL, NULL);

 

 

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error queuing kernel two for execution." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

 

 

 

 

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[0], CL_TRUE,

                                 0, 15 * sizeof(int), data,

                                 0, NULL, NULL);

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[1], CL_TRUE,

                                 0, m * sizeof(int), new_ones,

                                 0, NULL, NULL);

       errNum = clEnqueueReadBuffer(commandQueue, memObject[2], CL_TRUE,

                                 0, m * sizeof(int), new_zero,

                                 0, NULL, NULL);

       if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error reading final buffer." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

 

 

              std::cout << "new_ones" <<std::endl;

                    for (int i = 0; i < m; i++)

    {

        std::cout << new_ones[i] << " ";

    }

                    std::cout  <<std::endl;

                      std::cout << "new_zero" <<std::endl;

                    for (int i = 0; i < m; i++)

    {

        std::cout << new_zero[i] << " ";

    }

                    std::cout  <<std::endl;

              std::cout << "data" <<std::endl;

                    for (int i = 0; i < 15; i++)

    {

        std::cout << data[i] << " ";

    }

                    system("pause");

              cmp_val<<=1;

                    delete[]new_zero;

                              delete[]new_ones;

          }

 

}

 

Full Kernel Code

 

__kernel void radix1(__global int *data,__global int *g_ones,

          __global int *g_zero, int cmp_value )

{

      int gid=get_global_id(0);

            if(data[gid] & cmp_value)

            {

                      g_ones[gid]=data[gid];

                      g_zero[gid]=-1;

            }

            else

      {

                      g_ones[gid]=-1;

                      g_zero[gid]=data[gid];

            }

 

 

}

 

 

__kernel void radix2(__global int *data, __global int *new_ones,

          __global int *new_zero, int n)

{

          int id=get_global_id(0);

            if (id>=0 && id<n)

                      data[id]=new_zero[id];

            else if (id>=n && id < 15)

                      data[id]=new_ones[n-id];

}

Outcomes