hi,
I am learning opencl based on AMD platform. my vedio card is caicos. and I installed AMD stream sdk 2.4
I fininshed a median filter codes, but I met a issue. cpu reslut and gpu result are different in severl position, about 40 pixel. and make me curious, their position are random.
source code: http://code.google.com/p/imagefilter-opencl/downloads/list FilterFrame4.rar,
and if you want to run it, you need to freeimage lib and head files, it is included in Dist.rar files.
my cpu codes:
int gmedianFilter::cpu_median(unsigned char* inbuf, unsigned char* outbuf, int n)
{
int i, j, k, t1, t2;
k = (n-1)/2;
int k2 = n*n;
unsigned char *buf1=0;
unsigned char *buf2=0;
unsigned char *buf3=0;
unsigned char *buf4=0;
buf1 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf2 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf3 = (unsigned char *)malloc(k2*sizeof(unsigned char));
buf4 = (unsigned char *)malloc(k2*sizeof(unsigned char));
//unsigned char buf1[100],buf2[100],buf3[100],buf4[100];
for(j = 0; j < height; j++)
{
for(i = 0; i < width; i++)
{
if(i < k || j < k || i > width - k -1 || j > height - k -1)
{
outbuf[i*4 + j*width*4] = inbuf[i*4 + j*width*4];
outbuf[i*4 + j*width*4 + 1] = inbuf[i*4 + j*width*4 + 1];
outbuf[i*4 + j*width*4 + 2] = inbuf[i*4 + j*width*4 + 2];
outbuf[i*4 + j*width*4 + 3] = inbuf[i*4 + j*width*4 + 3];
continue;
}
int t = 0;
for(t1 = j - k; t1 <= j + k; t1++)
{
for(t2 = i - k; t2 <= i + k; t2++)
{
buf1
buf2
buf3
buf4
t++;
}
}
outbuf[i*4 + j*width*4] = MiddleValue(buf1, k2);
outbuf[i*4 + j*width*4 + 1] = MiddleValue(buf2, k2);
outbuf[i*4 + j*width*4 + 2] = MiddleValue(buf3, k2);
outbuf[i*4 + j*width*4 + 3] = MiddleValue(buf4, k2);
if(i==254&&j==3)
{
printf("cpu final color:%d, %d,%d,%d,%d,%d\n",i,j, outbuf[i*4 + j*width*4], outbuf[i*4 + j*width*4 + 1], outbuf[i*4 + j*width*4 + 2],outbuf[i*4 + j*width*4 + 3]);
}
}
}
if(buf1)
free(buf1);
if(buf2)
free(buf2);
if(buf3)
free(buf3);
if(buf4)
free(buf4);
return 0;
}
//bubble sort to get median value
unsigned char gmedianFilter::MiddleValue(unsigned char Array[],int n)
{
int i,j,t;
for(i = 0; i < n-1; i++)
{
for(j = 0; j {
if(Array
{
t = Array[j+1];
Array[j+1] = Array
Array
}
}
}
return(Array[(n-1)/2]);
}
kernel codes:
#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar4 sort(__global uchar4* Array, uint N)
{
int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array
{
Array[j+1].x = Array
Array
}
if(Array
{
Array[j+1].y = Array
Array
}
if(Array
{
Array[j+1].z = Array
Array
}
if(Array
{
Array[j+1].w = Array
Array
}
}
}
return Array[(N-1)/2];
}
uchar4 sortlocal(__local uchar4* Array, uint N)
{
int i,j;
uchar4 t;
for(i = 0; i < N-1; i++)
{
for(j = 0; j {
t = Array[j+1];
if(Array
{
Array[j+1].x = Array
Array
}
if(Array
{
Array[j+1].y = Array
Array
}
if(Array
{
Array[j+1].z = Array
Array
}
if(Array
{
Array[j+1].w = Array
Array
}
}
}
return Array[(N-1)/2];
}
//compared with former function, kenerl exce time is 6 times
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage,__global uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);
int k = (N-1)/2;
int n = N*N; //n*n
if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}
uchar4 finalcolor = (uchar4)(0);
int i,j;
int t = 0;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[(x+y*width)*n+t] = inputImage[i + j * width];
t++;
}
}
finalcolor = sort(tempbuf+(x+y*width)*n, n);
//if(x==254 && y==3)
// printf("final color:%d, %d,%d,%d,%d,%d\n", x,y,finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;
}
__kernel void filterlocal(__global uchar4* inputImage, __global uchar4* outputImage,__local uchar4* tempbuf, uint N)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_global_size(0);
int height = get_global_size(1);
int xid = get_local_id(0);
int yid = get_local_id(1);
int xwidth = get_local_size(0);
int ywidth = get_local_size(1);
int k = (N-1)/2;
int n = N*N; //n*n
if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
{
outputImage[x + y * width] = inputImage[x + y * width];
return;
}
uchar4 finalcolor = (uchar4)(0);
int i,j;
int t = 0;
int ad = (yid*xwidth+xid)*n;
for(j = y - k; j <= y + k; j++)
{
for(i = x - k; i <= x + k; i++)
{
tempbuf[ad+t] = inputImage[i + j * width];
t++;
}
}
finalcolor = sortlocal(tempbuf+ad, n);
//if(x==5 && y==5)
// printf("final color:%d,%d,%d,%d\n", finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
outputImage[x + y * width] = finalcolor;
}
and calling kernel codes:
// build the program from the source in the file
filter = clCreateKernel( program, "filter", NULL );
t.Reset();
t.Start();
//Create input, output and debug buffers.
src_buf = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width*height* 4 * sizeof(cl_uchar),
buf,
NULL );
dst_buf = clCreateBuffer( context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar),
NULL, NULL );
tmp_buf = clCreateBuffer(context,
CL_MEM_READ_WRITE,
width*height* 4 * sizeof(cl_uchar)*N*N,
NULL,
NULL );
t.Stop();
printf("copy from host to device :%.6f ms \n ", t.GetElapsedTime() *1000);
clSetKernelArg(filter, 0, sizeof(void *), (void*) &src_buf);
clSetKernelArg(filter, 1, sizeof(void *), (void*) &dst_buf);
clSetKernelArg(filter, 2, sizeof(void *), (void*) &tmp_buf);
//clSetKernelArg(filter, 2,32768,NULL);
//clSetKernelArg(filter, 2,width*height* 4 * sizeof(cl_uchar)*N*N,NULL);
clSetKernelArg(filter, 3, sizeof(cl_uint), &N);
hi, Micah
every thread sort in different local memory eara, such as, first thread, 0, second thread, 36, third thread, 72..., so I had no sync local memory.
actually, kernel function filterlocal result is right, it's same as cpu result.
but for global memory, it occurs some curious issues, my video card is caicos, I tried it in a old RV700 card(4670), it also had same issue. but made me curious that I ran it in my note book, result is right, my notebook is 5730 card(redwood).
Can you provide your system details
CPU
Driver
OS
If the code works on CPU but not on GPU, its generally a issue with coherency. Check out the places you need to have barriers. You can also use printf to see where things go wrong when running on GPU.
hi, my cpu is AMD Athlon(tm) II X3 425
card: HD6450(caicos)
driver: 8.850.0.0
driver data:2011-2-26
Can you mention Operating system also?
Have you tried doing some debugging using the printf to see where the data corruption might be happening?
thanks, my os is xp, actually, I have tried to printf data,
but when adding printf, result is right, but remove it, mistake occurs again.
maybe, it's stream sdk or driver issue.
well printf cause som sort of serialization so you should look at som race conditions. or there is indeed some bug in sdk.
I am not sure where the problem is. But this was a very similar issue reported recently: Transpose kernel fails
Anyways i will try to check your code for the issue and add that to the same problem. Thanks for reporting it.