5 Replies Latest reply on May 19, 2010 9:44 AM by pavandsp

    having problem  in using _local  in kernel

    pavandsp

      Hi,

      I have a problem using local memory in a kernel to optimize performance.

      I am not getting any output from the kernel. Please correct me if I am doing anything weird with the __local qualifier. Output buffer size = 1280x720x3.

      Global work size = 1280x720

      Local work-group size = 16x16

      GPU:ATI HD 5850,sdk 2.1, ubuntu 9.10.

       

       

      #pragma OPENCL EXTENSION cl_amd_printf : enable
      #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
      #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

      #define LOCAL_ROW_SIZE 16
      #define LOCAL_COL_SIZE 16
      /* Pixels per buffer row; one pixel is 3 consecutive bytes. */
      #define IMAGE_WIDTH 1280

      /*
       * In-place per-channel adjustment of a 3-bytes-per-pixel buffer, staged
       * through a work-group-local tile.
       *
       * Each work-item handles the pixel at (get_global_id(0), get_global_id(1)):
       * it copies the pixel's three bytes into local_temp, then for each channel
       * computes (value - c) * avg + a, saturates the result to 0..255 and
       * writes it back to the same location in `output`.
       *
       *   output          pixel buffer, read and overwritten in place
       *   rc, gc, bc      value subtracted from channel 0 / 1 / 2
       *   ravg, gavg, bavg  multiplier applied to channel 0 / 1 / 2
       *   ra, ga, ba      value added to channel 0 / 1 / 2
       *
       * NOTE(review): `kernel` is also a function-qualifier keyword in OpenCL C;
       * renaming the function would need a matching host-side change.
       * NOTE(review): no printf or atomic call appears in this kernel, so the
       * first two extension pragmas look unnecessary here - confirm.
       */
      __kernel void kernel(__global unsigned char * output,
                           const int rc, const int gc, const int bc,
                           const int ravg, const int gavg, const int bavg,
                           const int ra, const int ga, const int ba)
      {
          __local unsigned char local_temp[LOCAL_ROW_SIZE][LOCAL_COL_SIZE * 3];

          /* Per-channel parameters gathered so the channels can be looped over. */
          const int offset[3] = { rc,   gc,   bc   };
          const int gain[3]   = { ravg, gavg, bavg };
          const int bias[3]   = { ra,   ga,   ba   };

          const uint lx = get_local_id(0);
          const uint ly = get_local_id(1);
          /* Work-group coordinates; not used by the addressing below. */
          const uint gx = get_group_id(0);
          const uint gy = get_group_id(1);

          /* Byte index of this work-item's first channel in the global buffer. */
          const uint col_bytes = get_global_id(0) * 3;
          const uint row_x3    = get_global_id(1) * 3;
          const uint base      = row_x3 * IMAGE_WIDTH + col_bytes;

          /* Stage this work-item's pixel into the local tile. */
          for (uint c = 0; c < 3; ++c) {
              local_temp[ly][lx * 3 + c] = output[base + c];
          }

          /* NOTE(review): every work-item reads back only the tile bytes it wrote
           * itself, so the tile provides no data sharing in this version. */
          barrier(CLK_LOCAL_MEM_FENCE);

          for (uint c = 0; c < 3; ++c) {
              const int adjusted = ((local_temp[ly][lx * 3 + c] - offset[c]) * gain[c]) + bias[c];
              /* Saturating conversion: below 0 becomes 0, above 255 becomes 255. */
              output[base + c] = convert_uchar_sat(adjusted);
          }
      }

        • having problem  in using _local  in kernel
          pavandsp

          Hi

          The kernel works on the CPU, but I am puzzled as to why it does not work on the GPU. Please let me know if I have to change something for the GPU.

          Thanks in Advance

            • having problem  in using _local  in kernel
              omkaranathan

              What do you mean by no output? Are you getting some error? or incorrect output?

              Please make sure you are doing proper error checking.

              Also, it's easier to track down the problem if you provide the runtime (host) code too (a compilable test case).

                • having problem  in using _local  in kernel
                  pavandsp

                  Hi Omkar,

                  The output buffer from the kernel is empty (all zeros) on the GPU. There seems to be some issue with using __local, whereby data from local memory is not copied to the output buffer, or vice versa.

                  This kernel is part of a big project, so I guess it may not be feasible to send the complete code and setup.

                  I have also tried the following code before the barrier(), in place of the previous loads in the kernel, but the output is still all zeros. Are there any atomicity issues with the read/calculate/write sequence, and so on? I am clueless as to why it is not working on the GPU.

                  /* Alternative addressing for the load into local_temp: the global byte
                   * index is built from the work-group terms (gy, gx) plus the work-item's
                   * terms inside the group (ly, lx), using a row stride of 1280 * 3 bytes.
                   * NOTE(review): WG_HEIGHT_SIZE / WG_WIDTH_SIZE are not defined in the
                   * posted code; presumably both match the 16x16 work-group size - confirm. */
                  local_temp[ly][lx * 3 + 0] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 +  0) ];
                          local_temp[ly][lx * 3 + 1] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 +  1) ];
                          local_temp[ly][lx * 3 + 2] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 +  2) ];

                  Actually, I am trying to modify the kernel below to use __local memory, in order to understand the concept so that I can apply __local memory for performance in other parts of the program.

                  **********************************************

                  /*
                   * In-place per-channel adjustment of a 3-bytes-per-pixel buffer.
                   *
                   * Work-item i (get_global_id(0)) owns bytes 3*i .. 3*i+2 of `output`.
                   * For each of those bytes it computes (value - c) * avg + a, saturates
                   * the result to 0..255 and stores it back to the same byte.
                   *
                   *   output            pixel buffer, read and overwritten in place
                   *   rc, gc, bc        value subtracted from channel 0 / 1 / 2
                   *   ravg, gavg, bavg  multiplier applied to channel 0 / 1 / 2
                   *   ra, ga, ba        value added to channel 0 / 1 / 2
                   *
                   * NOTE(review): `kernel` is also a function-qualifier keyword in
                   * OpenCL C; renaming the function would need a host-side change.
                   */
                  __kernel void kernel(__global  unsigned char * output,
                                       const int rc, const int gc, const int bc,
                                       const int ravg, const int gavg, const int bavg,
                                       const int ra, const int ga, const int ba)
                  {
                          /* Per-channel parameters gathered so the channels can be looped over. */
                          const int offset[3] = { rc,   gc,   bc   };
                          const int gain[3]   = { ravg, gavg, bavg };
                          const int bias[3]   = { ra,   ga,   ba   };

                          /* Byte index of this work-item's first channel. */
                          const uint base = get_global_id(0) * 3;

                          for (uint c = 0; c < 3; ++c)
                          {
                                  const int adjusted = ((output[base + c] - offset[c]) * gain[c]) + bias[c];
                                  /* Saturating conversion: below 0 becomes 0, above 255 becomes 255. */
                                  output[base + c] = convert_uchar_sat(adjusted);
                          }
                  }

                  ***********************************
                  Thanks

                  Pavan