cancel
Showing results for 
Search instead for 
Did you mean: 

OpenCL

shreedhar_pawar
Adept II

Problem with the radix sort code written

Jump to solution

In the radix sort code that I have written, the second kernel is not working as want it to work...,  in the first kernel I have successfully managed to sort no.s ending with LSB 1 and LSB 0 into different arrays named g_ones and g_zero  for the first pass..(entire host code and kernel code is below) for the first pass.. In the second kernel , the elements of new_ones array(array obtained from g_ones in the host code by eliminating the invalid elements from g_ones) are not correctly transferred into the data array whereas the elements of new_zero  are correctly tranferred... I am reading both the arrays i.e. the data array and the new_ones array in my host code... the new_ones array gets printed properly but the data_array gets printed properly just for the transfer of new_zero array into it... The 2nd kernel is as follows and below that I have inserted the image of the output....

__kernel void radix2(__global int *data, __global int *new_ones,

          __global int *new_zero, int n)

{

          int id=get_global_id(0);

            if (id>=0 && id<n)             // n is the size of new_zero array.

                      data[id]=new_zero[id];

            else if (id>=n && id < 15)

                      data[id]=new_ones[n-id];

}

radix sort.PNG

I have tested all other parts in the code and I am sure that there isn't any problem with any of those parts... I think the problem may be with the 2nd kernel code itself ...

I am posting the entire code below...

HOST CODE

#include <iostream>

#include <fstream>

#include <sstream>

#ifdef __APPLE__

#include <OpenCL/cl.h>

#else

#include <CL/cl.h>

#endif

//  Create an OpenCL context on the first available platform using

//  either a GPU or CPU depending on what is available.

//

cl_context CreateContext()

{

    cl_int errNum;

    cl_uint numPlatforms;

    cl_platform_id firstPlatformId;

    cl_context context = NULL;

    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);

    if (errNum != CL_SUCCESS || numPlatforms <= 0)

    {

        std::cerr << "Failed to find any OpenCL platforms." << std::endl;

        return NULL;

    }

    cl_context_properties contextProperties[] =

    {

        CL_CONTEXT_PLATFORM,

        (cl_context_properties)firstPlatformId,

        0

    };

    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,

                                      NULL, NULL, &errNum);

    if (errNum != CL_SUCCESS)

    {

        std::cout << "Could not create GPU context, trying CPU..." << std::endl;

        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,

                                          NULL, NULL, &errNum);

        if (errNum != CL_SUCCESS)

        {

            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;

            return NULL;

        }

    }

    return context;

}

///

//  Create a command queue on the first device available on the

//  context

//

cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)

{

    cl_int errNum;

    cl_device_id *devices;

    cl_command_queue commandQueue = NULL;

    size_t deviceBufferSize = -1;

    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);

    if (errNum != CL_SUCCESS)

    {

        std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";

        return NULL;

    }

    if (deviceBufferSize <= 0)

    {

        std::cerr << "No devices available.";

        return NULL;

    }

    devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];

    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);

    if (errNum != CL_SUCCESS)

    {

        delete [] devices;

        std::cerr << "Failed to get device IDs";

        return NULL;

    }

    commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);

    if (commandQueue == NULL)

    {

        delete [] devices;

        std::cerr << "Failed to create commandQueue for device 0";

        return NULL;

    }

    *device = devices[0];

    delete [] devices;

    return commandQueue;

}

///

//  Create an OpenCL program from the kernel source file

//

cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)

{

    cl_int errNum;

    cl_program program;

std::ifstream kernelFile(fileName, std::ios::in);

    if (!kernelFile.is_open())

    {

        std::cerr << "Failed to open file for reading: " << fileName << std::endl;

        return NULL;

    }

    std::ostringstream oss;

    oss << kernelFile.rdbuf();

    std::string srcStdStr = oss.str();

    const char *srcStr = srcStdStr.c_str();

    program = clCreateProgramWithSource(context, 1,

                                        (const char**)&srcStr,

                                        NULL, NULL);

    if (program == NULL)

    {

        std::cerr << "Failed to create CL program from source." << std::endl;

        return NULL;

    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    if (errNum != CL_SUCCESS)

    {

        // Determine the reason for the error

        char buildLog[16384];

        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,

                              sizeof(buildLog), buildLog, NULL);

        std::cerr << "Error in kernel: " << std::endl;

        std::cerr << buildLog;

                    system("pause");

        clReleaseProgram(program);

        return NULL;

    }

    return program;

}

//  Cleanup any created OpenCL resources

//

void Cleanup(cl_context context, cl_command_queue commandQueue,

             cl_program program, cl_kernel kernel1,  cl_kernel kernel2 )

{

    if (commandQueue != 0)

        clReleaseCommandQueue(commandQueue);

    if (kernel1 != 0)

        clReleaseKernel(kernel1);

          if (kernel2 != 0)

        clReleaseKernel(kernel2);

    if (program != 0)

        clReleaseProgram(program);

    if (context != 0)

        clReleaseContext(context);

}

///

//      main() for radix sort

//

int main(int argc, char** argv)

{

    cl_context context = 0;

    cl_command_queue commandQueue = 0;

    cl_program program = 0;

    cl_device_id device = 0;

    cl_kernel kernel1 = 0;

          cl_kernel kernel2 = 0;

          cl_mem memObject[3] = {0,0,0};

    cl_int errNum;

    // Create an OpenCL context on first available platform

    context = CreateContext();

    if (context == NULL)

    {

        std::cerr << "Failed to create OpenCL context." << std::endl;

        return 1;

    }

    // Create a command-queue on the first device available

    // on the created context

    commandQueue = CreateCommandQueue(context, &device);

    if (commandQueue == NULL)

    {

         Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

    // Create OpenCL program from HelloWorld.cl kernel source

    program = CreateProgram(context, device, "HelloWorld.cl");

    if (program == NULL)

    {

      Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

    // Create OpenCL kernel

    kernel1 = clCreateKernel(program, "radix1", NULL);

    if (kernel1 == NULL)

    {

        std::cerr << "Failed to create kernel" << std::endl;

        Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

           kernel2 = clCreateKernel(program, "radix2", NULL);

    if (kernel2 == NULL)

    {

        std::cerr << "Failed to create kernel" << std::endl;

        Cleanup(context, commandQueue, program, kernel1,kernel2 );

        return 1;

    }

   int *data =(int*)malloc(sizeof(int)*15);

           int *g_ones=(int*)malloc(sizeof(int)*15);

            int *g_zero  =(int*)malloc(sizeof(int)*15);

          int j,k,q,p,cmp_val;

           for (int i = 0; i < 15; i++)

    {

        data=i;

    }

          cmp_val=1;          

           size_t globalWorkSize[1] = { 15};

    size_t localWorkSize[1] = { 5 };

          memObject[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,

                              sizeof(int) * 15, data, NULL);

                    memObject[1] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

                              sizeof(int) * 15, NULL, NULL);

                    memObject[2] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

                              sizeof(int) * 15, NULL, NULL);

// Actual sequential loop for radix sort for the bit pass...

                    for(int i=0;i<4;i++)

          {

                    j=0;k=0;

                    int m,n;

                    m=0;n=0;

            errNum = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &memObject[0]);

                                        errNum = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &memObject[1]);

                                                            errNum = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &memObject[2]);

                                                            errNum = clSetKernelArg(kernel1, 3, sizeof(int), &cmp_val);

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error setting kernel arguments." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

                    errNum = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL,

                              globalWorkSize, localWorkSize,

                              0, NULL, NULL);

        if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error queuing kernel one for execution." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[1], CL_TRUE,

                                 0, 15 * sizeof(int), g_ones,

                                 0, NULL, NULL);

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[2], CL_TRUE,

                                 0, 15 * sizeof(int), g_zero,

                                 0, NULL, NULL);

        if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error reading result buffers." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

              for( p=0;p<15;p++)

              {

                       if (g_ones

!=-1)

                       {

                                 m++;

                       }

                       if(g_zero

!=-1)

                       {

                                 n++;

                       }

              }

    int *new_ones =(int*)malloc(sizeof(int)*m);

          int *new_zero=(int*)malloc(sizeof(int)*n);

             for( p=0;p<15;p++)

             {

                       if (g_ones

!=-1)

                       {

                                 new_ones[j++]=g_ones

;

                       }

                       if(g_zero

!=-1)

                       {

                                 new_zero[k++]=g_zero

;

                       }

             }

errNum=          clEnqueueWriteBuffer(commandQueue,           memObject[1], CL_TRUE, 0,

sizeof(int) * m,new_ones, 0, NULL, NULL);

errNum=clEnqueueWriteBuffer(commandQueue,           memObject[2], CL_TRUE, 0,

sizeof(int) * n,new_zero, 0, NULL, NULL);

          if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error writing new ones and zero buffer." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

                    errNum = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &memObject[0]);

                                        errNum = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &memObject[1]);

                                                            errNum = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &memObject[2]);

                                                            errNum = clSetKernelArg(kernel2, 3, sizeof(int), &n);

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error setting kernel arguments." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

                    errNum = clEnqueueNDRangeKernel(commandQueue, kernel2, 1, NULL,

                              globalWorkSize, localWorkSize,

                              0, NULL, NULL);

                    if (errNum != CL_SUCCESS)

                     {

                              std::cerr << "Error queuing kernel two for execution." << std::endl;

                              Cleanup(context, commandQueue, program, kernel1,kernel2 );

                              return 1;

                     }

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[0], CL_TRUE,

                                 0, 15 * sizeof(int), data,

                                 0, NULL, NULL);

                    errNum = clEnqueueReadBuffer(commandQueue, memObject[1], CL_TRUE,

                                 0, m * sizeof(int), new_ones,

                                 0, NULL, NULL);

       errNum = clEnqueueReadBuffer(commandQueue, memObject[2], CL_TRUE,

                                 0, m * sizeof(int), new_zero,

                                 0, NULL, NULL);

       if (errNum != CL_SUCCESS)

        {

          std::cerr << "Error reading final buffer." << std::endl;

          Cleanup(context, commandQueue, program, kernel1,kernel2 );

           return 1;

        }

              std::cout << "new_ones" <<std::endl;

                    for (int i = 0; i < m; i++)

    {

        std::cout << new_ones << " ";

    }

                    std::cout  <<std::endl;

                      std::cout << "new_zero" <<std::endl;

                    for (int i = 0; i < m; i++)

    {

        std::cout << new_zero << " ";

    }

                    std::cout  <<std::endl;

              std::cout << "data" <<std::endl;

                    for (int i = 0; i < 15; i++)

    {

        std::cout << data << " ";

    }

                    system("pause");

              cmp_val<<=1;

                    delete[]new_zero;

                              delete[]new_ones;

          }

}

Full Kernel Code

__kernel void radix1(__global int *data,__global int *g_ones,

          __global int *g_zero, int cmp_value )

{

      int gid=get_global_id(0);

            if(data[gid] & cmp_value)

            {

                      g_ones[gid]=data[gid];

                      g_zero[gid]=-1;

            }

            else

      {

                      g_ones[gid]=-1;

                      g_zero[gid]=data[gid];

            }

}

__kernel void radix2(__global int *data, __global int *new_ones,

          __global int *new_zero, int n)

{

          int id=get_global_id(0);

            if (id>=0 && id<n)

                      data[id]=new_zero[id];

            else if (id>=n && id < 15)

                      data[id]=new_ones[n-id];

}

0 Likes
Reply
1 Solution

Accepted Solutions
himanshu_gautam
Grandmaster

Re: Problem with the radix sort code written

Jump to solution

The problem is when "id >=n" and "id <15" -- You are using "n-id" which is a -ve number. You should use "id-n".

View solution in original post

0 Likes
Reply
14 Replies
himanshu_gautam
Grandmaster

Re: Problem with the radix sort code written

Jump to solution

The problem is when "id >=n" and "id <15" -- You are using "n-id" which is a -ve number. You should use "id-n".

View solution in original post

0 Likes
Reply
shreedhar_pawar
Adept II

Re: Problem with the radix sort code written

Jump to solution

Yeah Himanshu it fixed the problem... actually really silly of me... But ..Thanx... a big one..!

0 Likes
Reply
shreedhar_pawar
Adept II

Re: Problem with the radix sort code written

Jump to solution

As you observe in this code... I go back to the host code just for finding the sizes of two arrays or rather for counting the invalid elements in those arrays and then come back to the kernel by giving a second kernel...! Can't I do this counting in a parallel way.....???

0 Likes
Reply
himanshu_gautam
Grandmaster

Re: Problem with the radix sort code written

Jump to solution

You can use "atomics" and get this done.

Now, inside your kernel, maintain a 2 local arrays[] which will actually be the "zeros" and "ones" array....
Globally as well, maintain 2 arrays[] g_ones and g_zeroes (allocate them to be as big as your data[] array)
Assuming 256 is your workgroup size:

volatile __local int zeroCounter, oneCounter;

__local int l_zeroes[256];

__local int l_ones[256];

..... initialize zeroCounter and oneCounter to 0......


if(data[gid] & cmp_value)
{
    int lval = atomic_inc(oneCounter); // Atomically increments counter and returns old value.

    l_ones[lval]=data[gid];

} else  {
    int lval = atomic_inc(zeroCounter); // Atomically increments and returns old value.

    l_zeroes[lval]=data[gid];

}
barrier(CLK_LOCAL_MEM_FENCE);

//zeroCounter and oneCounter will hold the total valid elements in l_zeroes and l_ones array.
//Now, we need to dump this location in g_ones[] and g_zeroes[] arrays.


int gPos;
if (get_local_id(0) == 0) {
    int localCounter = oneCounter;
    gPos = atomic_add(globalOneCounter, localCounter);  
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int tid = get_local_id(0); tid<oneCounter; tid+=get_local_size(0))
{
       g_ones[gPos + tid] = l_ones[tid];
}
barrier(CLK_LOCAL_MEM_FENCE);

//...Similarly copy out the Zero array as well to g_zeroes[] global array
//...
} -- end of kernel

// Now, global one counter and zero counter will hold the number of elements in g_ones and g_zeroes array
// Now Launch one another kernel that will copy out "g_ones" and "g_zeroes" elements back to "data".
0 Likes
Reply
bsp2020
Elite

Re: Problem with the radix sort code written

Jump to solution

I suggest that you take some parallel programming class. I recommend https://www.udacity.com/course/cs344

Though your code is written in OpenCL, it is not really doing the hard work in parallel.

shreedhar_pawar
Adept II

Re: Problem with the radix sort code written

Jump to solution

I actually modified my kernel code by setting some local arguments but i am getting a runtime error in queuing the kernel for execution, I have created more 3 mem objects as local arrays and also set them as kernel args... I am writing the modified part of the code below...!

Host Code

  memObject[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,

  sizeof( unsigned int) * 1024, data, NULL);

  memObject[1] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

  sizeof(unsigned int) * 1024, NULL, NULL);

  memObject[2] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

  sizeof( unsigned int) * 1024, NULL, NULL);

  memObject[3] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

  sizeof( unsigned int) * 1024, NULL, NULL);

  memObject[4] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

  sizeof(unsigned int) * 1024, NULL, NULL);

  memObject[5] = clCreateBuffer(context, CL_MEM_READ_WRITE ,

  sizeof( unsigned int) * 1024, NULL, NULL);

  for(unsigned int i=0;i<18;i++)

  {

  j=0;k=0;

  unsigned int m,n;

  m=0;n=0;

  errNum = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &memObject[0]);

  errNum = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &memObject[1]);

  errNum = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &memObject[2]);

  errNum = clSetKernelArg(kernel1, 3, sizeof(cl_mem), &memObject[3]);

  errNum = clSetKernelArg(kernel1, 4, sizeof(cl_mem), &memObject[4]);

  errNum = clSetKernelArg(kernel1, 5, sizeof(cl_mem), &memObject[5]);

  errNum = clSetKernelArg(kernel1, 6, sizeof(unsigned int), &cmp_val);

  if (errNum != CL_SUCCESS)

  {

  std::cerr << "Error setting kernel arguments." << std::endl;

  Cleanup(context, commandQueue, program, kernel1,kernel2 );

  return 1;

  }

  errNum = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL,

  globalWorkSize, localWorkSize,

  0, NULL, NULL);

        if (errNum != CL_SUCCESS)

  {

  std::cerr << "Error queuing kernel one for execution." << std::endl;

  Cleanup(context, commandQueue, program, kernel1,kernel2 );

  return 1;

  }

Modified Kernel Code

_kernel void radix1(__global uint *g_data,__global uint *g_ones,

  __global uint *g_zero, __local uint *data, __local uint *l_ones, __local uint* l_zero, uint cmp_value)

{

 

  uint tid;

// uint l_ones[1024],l_zero[1024];

  uint id=get_global_id(0);

  data[id]=g_data[id];

  barrier(CLK_LOCAL_MEM_FENCE);

  uint gid=get_global_id(0);

   if(data[gid] & cmp_value)

   {

   l_ones[gid]=data[gid];

   l_zero[gid]=-1;

   }

   else

      {

   l_ones[gid]=-1;

   l_zero[gid]=data[gid];

   }

   barrier(CLK_LOCAL_MEM_FENCE);

  tid=get_global_id(0);

    g_ones[tid]=l_ones[tid];

    g_zero[tid]=l_zero[tid];

  

}

What is the actual fault...?


0 Likes
Reply
shreedhar_pawar
Adept II

Re: Problem with the radix sort code written

Jump to solution

Hi bsp2020,

  I know that my code is not doing much part in parallel... I guess you mean the part that counts the sizes of two arrays for which I come back to the host code and then again declare the 2nd kernel...! Or is there any other part of the code which you say is not efficient...?

Also the udacity course is based on CUDA I guess not on OpenCL...! Will I benefit from it..?

0 Likes
Reply
himanshu_gautam
Grandmaster

Re: Problem with the radix sort code written

Jump to solution

You are allocating 5*4K = 20K of local memory. I think you are using NVIDIA card and hence you will have only 16K of shared memory out there....(+32K of L1 -- if you are FERMI+).

So, this will not work....

btw, why do you need so many local buffers? In my example, I had used only 2 local buffers -- that too small ones.

What are you trying to do?

0 Likes
Reply
shreedhar_pawar
Adept II

Re: Problem with the radix sort code written

Jump to solution

Hi Himanshu,

  As you had told me to do atomic operations with local memory, so I declared the l_ones , l_zero array as local memories... As the no. of elements that I want to sort is more so is their size... If not in this way how do I do the local atomic operations...?

0 Likes
Reply