zhuzxy

Can anyone give me some advice on using vectors in OpenCL on the CPU?

Discussion created by zhuzxy on Oct 17, 2011
Latest reply on Oct 17, 2011 by notzed

I was hoping that OpenCL vector types would speed up my algorithm considerably, but my OpenCL code seems to be more than 50% slower than the native C code. Can anyone give me some advice?

My sample CL code looks like the following:

/*
 * testvector_kernel_cpu
 *
 * For the work-item with index id = get_global_id(1):
 *   - reads a coordinate pair (x, y) from param1[2*id] and param1[2*id + 1],
 *   - forms the centre offset cpx = x + y * width into img,
 *   - gathers eight differences between pixels placed symmetrically
 *     around cpx,
 *   - weights them with two constant coefficient vectors and sums each
 *     weighted vector into dx and dy,
 *   - stores dx / dy in param2[id] (0 when dy == 0).
 *
 * Parameters:
 *   img     source pixels, indexed relative to cpx.
 *   width   multiplier applied to y when forming cpx.
 *   height  not used by this kernel.
 *   param1  input, two uints per work-item (x then y).
 *   param2  output, one uint per work-item.
 *
 * No bounds checks are performed on any img access.
 */
__kernel  void testvector_kernel_cpu(
            __global uchar* img,
            uint width,
            uint height,
            __global uint *param1,
            __global uint * param2
           )
{
    int wg1 = get_global_id(1);

    int x_val = param1[wg1*2];
    int y_val = param1[wg1*2 + 1];
    int cpx = x_val + y_val * width;

    /*
     * Row stride used for the neighbour offsets below.
     * NOTE(review): cpx is built from `width`, but the neighbours use this
     * fixed stride of 19 -- confirm whether W is meant to equal `width`.
     */
    int W = 19;

    /* uchar operands promote to int, so each difference is signed. */
    int8 dv = (int8)(
        //ring1 , 6
        img[cpx +0+2*W] - img[cpx +0-2*W],
        img[cpx +1+2*W] - img[cpx -1-2*W],
        img[cpx +2+1*W] - img[cpx -2-1*W],
        img[cpx +2+0*W] - img[cpx -2+0*W],
        img[cpx +2-1*W] - img[cpx -2+1*W],
        img[cpx +1-2*W] - img[cpx -1+2*W],
        //ring2 , 8
        img[cpx +0+3*W] - img[cpx +0-3*W],
        img[cpx +1+3*W] - img[cpx -1-3*W]);

    /* Component-wise weighting of the eight differences. */
    int8 dx0 = (int8)( 0, 14, 28, 32,  28,  14,  0, 10) * dv;
    int8 dy0 = (int8)(32, 28, 14,  0, -14, -28, 32, 30) * dv;

    /* Horizontal sums of the weighted components. */
    int dx = dx0.s0 + dx0.s1 + dx0.s2 + dx0.s3 + dx0.s4 + dx0.s5 + dx0.s6 + dx0.s7;
    int dy = dy0.s0 + dy0.s1 + dy0.s2 + dy0.s3 + dy0.s4 + dy0.s5 + dy0.s6 + dy0.s7;

    /*
     * Integer division by zero yields an unspecified value in OpenCL C,
     * so write a defined result (0) when dy is zero.
     */
    param2[wg1] = (dy != 0) ? (dx / dy) : 0;
}

 

Another piece of code looks like the following:

 __kernel  void cal_grad_kernel_cpu(
            __global uchar* img,    
            uint width,             
            uint height,            
            __global uint *param1,  
            __global uint * param2, 
            __global uchar* patch_img

           )
{
    int wg = get_global_id(0);
    int region_offset = wg * width;
    int patch_offset = wg * width*2;
    int W = 80;
    for (int i = 0; i < 2 * 72; i++)
    {
       int newsrc_pos = ( param1[wg]);
       int newsrc_pos1 = ( param1[wg * 8 +1]);
       // Is this calculation worth?, or just do a memory copy to 25x25.
       int4 tmppix = ((int4)( img[region_offset + newsrc_pos + 1] - img[region_offset + newsrc_pos],
                             img[region_offset + newsrc_pos + W+1] - img[region_offset + newsrc_pos + W],
                             img[region_offset + newsrc_pos1 + 1] - img[region_offset + newsrc_pos1],
                             img[region_offset + newsrc_pos1 + W+1] - img[region_offset + newsrc_pos1 + W])
                     * (int4)(param2[wg],param2[wg+1], param2[wg+2], param2[wg+3]))>>8;

       tmppix = (int4)( img[region_offset + newsrc_pos], img[region_offset + newsrc_pos + W],img[region_offset + newsrc_pos1], img[region_offset + newsrc_pos1 + W]) + (tmppix);

       patch_img[patch_offset] =( unsigned char )( tmppix.x + ((( tmppix.y - tmppix.x)*param2[wg*2])>>8));
       patch_img[patch_offset + 1] =( unsigned char )( tmppix.z + ((( tmppix.w - tmppix.z)*param2[wg*2 +2])>>8));
       patch_offset += 2;

    }

}

Outcomes