mikewolf_gkd

median filter issue

Discussion created by mikewolf_gkd on May 10, 2011
Latest reply on May 14, 2011 by himanshu.gautam

hi,

     I am learning opencl based on AMD platform. my vedio card is caicos. and I installed AMD stream sdk 2.4

I fininshed a median filter codes, but I met a issue. cpu reslut and gpu result are different in severl position, about 40 pixel. and make me curious, their position are random.

source code: http://code.google.com/p/imagefilter-opencl/downloads/list  FilterFrame4.rar,

and if you want to run it, you need to freeimage lib and head files, it is included in Dist.rar files.

 

my cpu codes:

int gmedianFilter::cpu_median(unsigned char* inbuf, unsigned char* outbuf, int n)
{
int i, j, k, t1, t2;

k = (n-1)/2;
int k2 = n*n;

unsigned char *buf1=0;
unsigned char *buf2=0;
unsigned char *buf3=0;
unsigned char *buf4=0;

buf1 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf2 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf3 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf4 = (unsigned char *)malloc(k2*sizeof(unsigned char));

//unsigned char buf1[100],buf2[100],buf3[100],buf4[100];

for(j = 0; j < height; j++)
{
for(i = 0; i < width; i++)
{
if(i < k || j < k || i > width - k -1 || j > height - k -1)
{
outbuf[i*4 + j*width*4] = inbuf[i*4 + j*width*4];
outbuf[i*4 + j*width*4 + 1] = inbuf[i*4 + j*width*4 + 1];
outbuf[i*4 + j*width*4 + 2] = inbuf[i*4 + j*width*4 + 2];
outbuf[i*4 + j*width*4 + 3] = inbuf[i*4 + j*width*4 + 3];
continue;
}

int t = 0;
for(t1 = j - k; t1 <= j + k; t1++)
{
for(t2 = i - k; t2 <= i + k; t2++)
{
buf1[t] = inbuf[t2*4 + t1*width*4];
buf2[t] = inbuf[t2*4 + t1*width*4 + 1];
buf3[t] = inbuf[t2*4 + t1*width*4 + 2];
buf4[t] = inbuf[t2*4 + t1*width*4 + 3];
t++;

}
}

 

outbuf[i*4 + j*width*4] = MiddleValue(buf1, k2);
outbuf[i*4 + j*width*4 + 1] = MiddleValue(buf2, k2);
outbuf[i*4 + j*width*4 + 2] = MiddleValue(buf3, k2);
outbuf[i*4 + j*width*4 + 3] = MiddleValue(buf4, k2);
if(i==254&&j==3)
{
printf("cpu final color:%d, %d,%d,%d,%d,%d\n",i,j, outbuf[i*4 + j*width*4], outbuf[i*4 + j*width*4 + 1], outbuf[i*4 + j*width*4 + 2],outbuf[i*4 + j*width*4 + 3]);
}
}
}


if(buf1)
free(buf1);
if(buf2)
free(buf2);
if(buf3)
free(buf3);
if(buf4)
free(buf4);
return 0;

}

//bubble sort to get median value
unsigned char gmedianFilter::MiddleValue(unsigned char Array[],int n)
{
int i,j,t;
for(i = 0; i < n-1; i++)
{
for(j = 0; j {
if(Array[j]>Array[j+1])
{
t = Array[j+1];
Array[j+1] = Array[j];
Array[j] = t;
}
}
}
return(Array[(n-1)/2]);
}

 

kernel codes:

#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar4 sort(__global uchar4* Array, uint N)
{

int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array[j].x>Array[j+1].x)
{
Array[j+1].x = Array[j].x;
Array[j].x = t.x;
}
if(Array[j].y>Array[j+1].y)
{
Array[j+1].y = Array[j].y;
Array[j].y = t.y;
}
if(Array[j].z>Array[j+1].z)
{
Array[j+1].z = Array[j].z;
Array[j].z = t.z;
}
if(Array[j].w>Array[j+1].w)
{
Array[j+1].w = Array[j].w;
Array[j].w = t.w;
}
}
}
return Array[(N-1)/2];
}
uchar4 sortlocal(__local uchar4* Array, uint N)
{

int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array[j].x>Array[j+1].x)
{
Array[j+1].x = Array[j].x;
Array[j].x = t.x;
}
if(Array[j].y>Array[j+1].y)
{
Array[j+1].y = Array[j].y;
Array[j].y = t.y;
}
if(Array[j].z>Array[j+1].z)
{
Array[j+1].z = Array[j].z;
Array[j].z = t.z;
}
if(Array[j].w>Array[j+1].w)
{
Array[j+1].w = Array[j].w;
Array[j].w = t.w;
}
}
}
return Array[(N-1)/2];
}
//compared with former function, kenerl exce time is 6 times
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage,__global uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);

int k = (N-1)/2;
int n = N*N; //n*n

if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}

uchar4 finalcolor = (uchar4)(0);

int i,j;
int t = 0;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[(x+y*width)*n+t] = inputImage[i + j * width];
t++;
}
}

finalcolor = sort(tempbuf+(x+y*width)*n, n);

//if(x==254 && y==3)
// printf("final color:%d, %d,%d,%d,%d,%d\n", x,y,finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;

}

__kernel void filterlocal(__global uchar4* inputImage, __global uchar4* outputImage,__local uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);
int xid = get_local_id(0);
int yid = get_local_id(1);
int xwidth = get_local_size(0);
int ywidth = get_local_size(1);

int k = (N-1)/2;
int n = N*N; //n*n

if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}

uchar4 finalcolor = (uchar4)(0);

int i,j;
int t = 0;
int ad = (yid*xwidth+xid)*n;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[ad+t] = inputImage[i + j * width];
t++;
}
}

finalcolor = sortlocal(tempbuf+ad, n);
//if(x==5 && y==5)
// printf("final color:%d,%d,%d,%d\n", finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;

}

 

and calling kernel codes:

// build the program from the source in the file
filter = clCreateKernel( program, "filter", NULL );

t.Reset();
t.Start();

//Create input, output and debug buffers.
src_buf = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width*height* 4 * sizeof(cl_uchar),
buf,
NULL );
dst_buf = clCreateBuffer( context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar),
NULL, NULL );

tmp_buf = clCreateBuffer(context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar)*N*N,
NULL,
NULL );


t.Stop();
printf("copy from host to device :%.6f ms \n ", t.GetElapsedTime() *1000);

clSetKernelArg(filter, 0, sizeof(void *), (void*) &src_buf);
clSetKernelArg(filter, 1, sizeof(void *), (void*) &dst_buf);
clSetKernelArg(filter, 2, sizeof(void *), (void*) &tmp_buf);
//clSetKernelArg(filter, 2,32768,NULL);
//clSetKernelArg(filter, 2,width*height* 4 * sizeof(cl_uchar)*N*N,NULL);
clSetKernelArg(filter, 3, sizeof(cl_uint), &N);

 

Outcomes