I was hoping that using OpenCL vector types would speed up my algorithm considerably, but my OpenCL code turns out to be more than 50% slower than the native C code. Can anyone give me some advice?
My sample OpenCL code looks like the following:
/*
 * testvector_kernel_cpu - weighted pixel-difference ratio around a point.
 *
 * Each work-item (indexed by its global id in dimension 1) reads one
 * (x, y) pair from param1, locates the centre pixel at x + y * width,
 * forms eight differences of image bytes taken at opposite offsets around
 * that centre, weights them with two fixed coefficient sets and stores the
 * integer quotient dx / dy in param2.
 *
 *   img     byte image, only read here
 *   width   row pitch used to locate the centre pixel
 *   height  not used by this kernel
 *   param1  interleaved pairs: param1[2*i] = x, param1[2*i + 1] = y
 *   param2  output, one value per work-item
 *
 * When dy is zero the kernel stores 0: OpenCL integer division by zero
 * yields an unspecified value, so the result would otherwise be
 * non-deterministic.
 */
__kernel void testvector_kernel_cpu(
__global const uchar* img,
uint width,
uint height,
__global const uint *param1,
__global uint * param2
)
{
    const int wg1 = get_global_id (1);
    const int x_val = param1[wg1*2];
    const int y_val = param1[wg1*2 + 1];
    const int cpx = x_val + y_val * width;

    /* NOTE(review): the neighbour offsets use a fixed stride of 19 while the
     * centre is located with `width` - confirm that this is intended. */
    const int W = 19;

    /* uchar operands are promoted to int before the subtraction, so the
     * differences are signed values in [-255, 255]. */
    const int8 dv = (int8) (
        //ring1 , 6
        img[cpx +0+2*W] - img[cpx +0-2*W],
        img[cpx +1+2*W] - img[cpx -1-2*W],
        img[cpx +2+1*W] - img[cpx -2-1*W],
        img[cpx +2+0*W] - img[cpx -2+0*W],
        img[cpx +2-1*W] - img[cpx -2+1*W],
        img[cpx +1-2*W] - img[cpx -1+2*W],
        //ring2 , 8
        img[cpx +0+3*W] - img[cpx +0-3*W],
        img[cpx +1+3*W] - img[cpx -1-3*W]);

    const int8 dx0 = (int8)( 0,14,28,32,28,14,0,10) * dv;
    const int8 dy0 = (int8)( 32,28,14,0,-14,-28,32,30) * dv;

    /* Horizontal sums by halving the vector (8 -> 4 -> 2 -> 1); the result
     * equals the sum of all eight lanes. */
    const int4 dx4 = dx0.lo + dx0.hi;
    const int4 dy4 = dy0.lo + dy0.hi;
    const int2 dx2 = dx4.lo + dx4.hi;
    const int2 dy2 = dy4.lo + dy4.hi;
    const int dx = dx2.x + dx2.y;
    const int dy = dy2.x + dy2.y;

    /* Guard the division: a flat neighbourhood makes dy == 0. */
    param2[wg1] = (dy != 0) ? (uint)(dx / dy) : 0u;
}
Another piece of code looks like the following:
/*
 * cal_grad_kernel_cpu - blend four neighbouring pixel pairs and fill a patch.
 *
 * Each work-item (global id in dimension 0) reads two positions from
 * param1, blends each pixel with its right-hand neighbour using weights
 * from param2 in 8-bit fixed point (multiply, then >> 8), blends the
 * resulting row pairs a second time, and writes the two resulting bytes
 * 2 * 72 times into consecutive positions of patch_img starting at
 * wg * width * 2.
 *
 *   img        byte image, only read here
 *   width      used to derive the per-work-item image and patch offsets
 *   height     not used by this kernel
 *   param1     source positions (param1[wg] and param1[wg * 8 + 1])
 *   param2     blend weights
 *   patch_img  output bytes
 *
 * Nothing computed here depends on the loop counter, so the two output
 * bytes are evaluated once and the loop only stores them.
 * NOTE(review): this assumes patch_img does not overlap img, param1 or
 * param2 (the previous version re-read them on every iteration) - confirm
 * against the host code.
 */
__kernel void cal_grad_kernel_cpu(
__global const uchar* img,
uint width,
uint height,
__global const uint *param1,
__global const uint * param2,
__global uchar* patch_img
)
{
    const int wg = get_global_id(0);
    const int region_offset = wg * width;
    int patch_offset = wg * width*2;
    const int W = 80;

    const int newsrc_pos = ( param1[wg]);
    const int newsrc_pos1 = ( param1[wg * 8 +1]);
    const int pos0 = region_offset + newsrc_pos;
    const int pos1 = region_offset + newsrc_pos1;

    // Is this calculation worth it, or should we just do a memory copy to 25x25?
    /* Each pixel is loaded once; `right` holds the horizontal neighbours. */
    const int4 base = (int4)( img[pos0], img[pos0 + W],
                              img[pos1], img[pos1 + W]);
    const int4 right = (int4)( img[pos0 + 1], img[pos0 + W+1],
                               img[pos1 + 1], img[pos1 + W+1]);
    const int4 weight = (int4)(param2[wg], param2[wg+1], param2[wg+2], param2[wg+3]);

    /* base + ((right - base) * weight >> 8): first blend, along the row. */
    const int4 tmppix = base + (((right - base) * weight) >> 8);

    /* Second blend between the two rows of each pair.  param2 is uint, so
     * this arithmetic is done in unsigned, exactly as before; only the low
     * 8 bits survive the cast. */
    const uchar out0 = ( unsigned char )( tmppix.x + ((( tmppix.y - tmppix.x)*param2[wg*2])>>8));
    const uchar out1 = ( unsigned char )( tmppix.z + ((( tmppix.w - tmppix.z)*param2[wg*2 +2])>>8));

    for (int i = 0; i < 2 * 72; i++)
    {
        patch_img[patch_offset] = out0;
        patch_img[patch_offset + 1] = out1;
        patch_offset += 2;
    }
}