I wrote two OpenCL kernels that calculate the box filter: one using local memory and the other one without the local memory. The performance of the kernel that does not use the local memory is way better than the one that uses local memory. The one with the local memory takes 30ms and the one without takes 19ms. I thought it should be the other way around.
#define FILTER_RADIUS (3)
#define FILTER_SIZE (2*FILTER_RADIUS + 1)
#define CACHE_PITCH (2*FILTER_RADIUS + 16)
//WITH LOCAL mem
__kernel void boxLmem(__global const uchar *in,
int pitch,
__global uchar *out)
{
int x = get_global_id(0);
int y = get_global_id(1);
__local uchar cache[CACHE_PITCH*CACHE_PITCH];
int lx = get_local_id(0);
int ly = get_local_id(1);
lx += FILTER_RADIUS;
ly += FILTER_RADIUS;
if(lx == FILTER_RADIUS)
{
if(ly == FILTER_RADIUS)
{
for(int row = -FILTER_RADIUS; row < 0; row++)
{
for(int col = -FILTER_RADIUS; col <= FILTER_RADIUS + 16; ++col)
{
cache[lx + col + CACHE_PITCH*(ly + row)] = in[x + col + pitch*(y + row)];
}
}
}
else if(ly == 15 + FILTER_RADIUS)
{
for(int row = 1; row <= FILTER_RADIUS; row++)
{
for(int col = -FILTER_RADIUS; col <= FILTER_RADIUS + 16; ++col)
{
cache[lx + col + CACHE_PITCH*(ly + row)] = in[x + col + pitch*(y + row)];
}
}
}
for(int col = -FILTER_RADIUS; col < 0; ++col)
{
cache[lx + col + CACHE_PITCH*(ly)] = in[x + col + pitch*(y)];
}
}
else if(lx == 15 + FILTER_RADIUS)
{
for(int col = 1; col <= FILTER_RADIUS; ++col)
{
cache[lx + col + CACHE_PITCH*(ly)] = in[x + col + pitch*(y)];
}
}
cache[lx + CACHE_PITCH*ly] = in[x + pitch*y];
barrier(CLK_LOCAL_MEM_FENCE);
float fout = 0.f;
for(int j = -FILTER_RADIUS; j <= FILTER_RADIUS; ++j)
{
for(int i = -FILTER_RADIUS; i <= FILTER_RADIUS; ++i)
{
fout += convert_float(cache[(lx + i) + (ly + j)*CACHE_PITCH])* (1.f/(FILTER_SIZE*FILTER_SIZE));
}
}
out[x + pitch*y] = convert_uchar_sat(fout);
}
////////////////////////WITHHOUT LOCAL MEMORY///////////////////
#define FILTER_RADIUS (3)
__kernel void box(__global const uchar *in,
int pitch,
__global uchar *out)
{
int x = get_global_id(0);
int y = get_global_id(1);
int filterSize = 2*FILTER_RADIUS + 1;
float fout = 0.f;
for(int j = -FILTER_RADIUS; j <= FILTER_RADIUS; ++j)
{
for(int i = -FILTER_RADIUS; i <= FILTER_RADIUS; ++i)
{
fout += convert_float(in[(x + i) + (y + j)*pitch])/(filterSize*filterSize);
}
}
out[x + pitch*y] = convert_uchar_sat(fout);
}