Archives Discussions

mikewolf_gkd · ‎05-10-2011

hi,

I am learning opencl based on AMD platform. my vedio card is caicos. and I installed AMD stream sdk 2.4

I fininshed a median filter codes, but I met a issue. cpu reslut and gpu result are different in severl position, about 40 pixel. and make me curious, their position are random.

source code: http://code.google.com/p/imagefilter-opencl/downloads/list FilterFrame4.rar,

and if you want to run it, you need to freeimage lib and head files, it is included in Dist.rar files.

my cpu codes:

int gmedianFilter::cpu_median(unsigned char* inbuf, unsigned char* outbuf, int n)
{
int i, j, k, t1, t2;

k = (n-1)/2;
int k2 = n*n;

unsigned char *buf1=0;
unsigned char *buf2=0;
unsigned char *buf3=0;
unsigned char *buf4=0;

buf1 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf2 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf3 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf4 = (unsigned char *)malloc(k2*sizeof(unsigned char));

//unsigned char buf1[100],buf2[100],buf3[100],buf4[100];

for(j = 0; j < height; j++)
{
for(i = 0; i < width; i++)
{
if(i < k || j < k || i > width - k -1 || j > height - k -1)
{
outbuf[i*4 + j*width*4] = inbuf[i*4 + j*width*4];
outbuf[i*4 + j*width*4 + 1] = inbuf[i*4 + j*width*4 + 1];
outbuf[i*4 + j*width*4 + 2] = inbuf[i*4 + j*width*4 + 2];
outbuf[i*4 + j*width*4 + 3] = inbuf[i*4 + j*width*4 + 3];
continue;
}

int t = 0;
for(t1 = j - k; t1 <= j + k; t1++)
{
for(t2 = i - k; t2 <= i + k; t2++)
{
buf1 = inbuf[t2*4 + t1*width*4];
buf2 = inbuf[t2*4 + t1*width*4 + 1];
buf3 = inbuf[t2*4 + t1*width*4 + 2];
buf4 = inbuf[t2*4 + t1*width*4 + 3];
t++;

}
}

outbuf[i*4 + j*width*4] = MiddleValue(buf1, k2);
outbuf[i*4 + j*width*4 + 1] = MiddleValue(buf2, k2);
outbuf[i*4 + j*width*4 + 2] = MiddleValue(buf3, k2);
outbuf[i*4 + j*width*4 + 3] = MiddleValue(buf4, k2);
if(i==254&&j==3)
{
printf("cpu final color:%d, %d,%d,%d,%d,%d\n",i,j, outbuf[i*4 + j*width*4], outbuf[i*4 + j*width*4 + 1], outbuf[i*4 + j*width*4 + 2],outbuf[i*4 + j*width*4 + 3]);
}
}
}

if(buf1)
free(buf1);
if(buf2)
free(buf2);
if(buf3)
free(buf3);
if(buf4)
free(buf4);
return 0;

}

//bubble sort to get median value
unsigned char gmedianFilter::MiddleValue(unsigned char Array[],int n)
{
int i,j,t;
for(i = 0; i < n-1; i++)
{
for(j = 0; j {
if(Array>Array[j+1])
{
t = Array[j+1];
Array[j+1] = Array;
Array = t;
}
}
}
return(Array[(n-1)/2]);
}

kernel codes:

#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar4 sort(__global uchar4* Array, uint N)
{

int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array.x>Array[j+1].x)
{
Array[j+1].x = Array.x;
Array.x = t.x;
}
if(Array.y>Array[j+1].y)
{
Array[j+1].y = Array.y;
Array.y = t.y;
}
if(Array.z>Array[j+1].z)
{
Array[j+1].z = Array.z;
Array.z = t.z;
}
if(Array.w>Array[j+1].w)
{
Array[j+1].w = Array.w;
Array.w = t.w;
}
}
}
return Array[(N-1)/2];
}
uchar4 sortlocal(__local uchar4* Array, uint N)
{

int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array.x>Array[j+1].x)
{
Array[j+1].x = Array.x;
Array.x = t.x;
}
if(Array.y>Array[j+1].y)
{
Array[j+1].y = Array.y;
Array.y = t.y;
}
if(Array.z>Array[j+1].z)
{
Array[j+1].z = Array.z;
Array.z = t.z;
}
if(Array.w>Array[j+1].w)
{
Array[j+1].w = Array.w;
Array.w = t.w;
}
}
}
return Array[(N-1)/2];
}
//compared with former function, kenerl exce time is 6 times
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage,__global uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);

int k = (N-1)/2;
int n = N*N; //n*n

if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}

uchar4 finalcolor = (uchar4)(0);

int i,j;
int t = 0;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[(x+y*width)*n+t] = inputImage[i + j * width];
t++;
}
}

finalcolor = sort(tempbuf+(x+y*width)*n, n);

//if(x==254 && y==3)
// printf("final color:%d, %d,%d,%d,%d,%d\n", x,y,finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;

}

__kernel void filterlocal(__global uchar4* inputImage, __global uchar4* outputImage,__local uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);
int xid = get_local_id(0);
int yid = get_local_id(1);
int xwidth = get_local_size(0);
int ywidth = get_local_size(1);

int k = (N-1)/2;
int n = N*N; //n*n

if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}

uchar4 finalcolor = (uchar4)(0);

int i,j;
int t = 0;
int ad = (yid*xwidth+xid)*n;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[ad+t] = inputImage[i + j * width];
t++;
}
}

finalcolor = sortlocal(tempbuf+ad, n);
//if(x==5 && y==5)
// printf("final color:%d,%d,%d,%d\n", finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;

}

and calling kernel codes:

// build the program from the source in the file
filter = clCreateKernel( program, "filter", NULL );

t.Reset();
t.Start();

//Create input, output and debug buffers.
src_buf = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width*height* 4 * sizeof(cl_uchar),
buf,
NULL );
dst_buf = clCreateBuffer( context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar),
NULL, NULL );

tmp_buf = clCreateBuffer(context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar)*N*N,
NULL,
NULL );

t.Stop();
printf("copy from host to device :%.6f ms \n ", t.GetElapsedTime() *1000);

clSetKernelArg(filter, 0, sizeof(void *), (void*) &src_buf);
clSetKernelArg(filter, 1, sizeof(void *), (void*) &dst_buf);
clSetKernelArg(filter, 2, sizeof(void *), (void*) &tmp_buf);
//clSetKernelArg(filter, 2,32768,NULL);
//clSetKernelArg(filter, 2,width*height* 4 * sizeof(cl_uchar)*N*N,NULL);
clSetKernelArg(filter, 3, sizeof(cl_uint), &N);

MicahVillmow · ‎05-10-2011

mike,
How are you synchronizing between threads in a work-group when doing the sort in local memory? I don't see any barriers in your code. You most likely have a race condition on the GPU that doesn't appear on the CPU.

mikewolf_gkd · ‎05-11-2011

hi, Micah

every thread sort in different local memory eara, such as, first thread, 0, second thread, 36, third thread, 72..., so I had no sync local memory.

actually, kernel function filterlocal result is right, it's same as cpu result.

but for global memory, it occurs some curious issues, my video card is caicos, I tried it in a old RV700 card(4670), it also had same issue. but made me curious that I ran it in my note book, result is right, my notebook is 5730 card(redwood).

himanshu_gautam · ‎05-12-2011

Can you provide your system details

CPU

Driver

OS

If the code works on CPU but not on GPU, its generally a issue with coherency. Check out the places you need to have barriers. You can also use printf to see where things go wrong when running on GPU.

mikewolf_gkd · ‎05-12-2011

hi, my cpu is AMD Athlon(tm) II X3 425

card: HD6450(caicos)

driver: 8.850.0.0

driver data:2011-2-26

himanshu_gautam · ‎05-13-2011

Can you mention Operating system also?

Have you tried doing some debugging using the printf to see where the data corruption might be happening?

mikewolf_gkd · ‎05-13-2011

thanks, my os is xp, actually, I have tried to printf data,

but when adding printf, result is right, but remove it, mistake occurs again.

maybe, it's stream sdk or driver issue.

nou · ‎05-14-2011

well printf cause som sort of serialization so you should look at som race conditions. or there is indeed some bug in sdk.

himanshu_gautam · ‎05-14-2011

I am not sure where the problem is. But this was a very similar issue reported recently: Transpose kernel fails

Anyways i will try to check your code for the issue and add that to the same problem. Thanks for reporting it.

Archives Discussions

median filter issue