8 Replies Latest reply on May 14, 2011 7:24 PM by himanshu.gautam

    median filter issue

    mikewolf_gkd

      hi,

           I am learning OpenCL on the AMD platform. My video card is a Caicos, and I have installed AMD Stream SDK 2.4.

      I finished a median filter implementation, but I ran into an issue: the CPU result and the GPU result differ at several positions (about 40 pixels). What makes me curious is that their positions are random.

      source code: http://code.google.com/p/imagefilter-opencl/downloads/list  FilterFrame4.rar,

      If you want to run it, you need the FreeImage library and header files; they are included in Dist.rar.

       

      My CPU code:

      /**
       * CPU reference implementation of a median filter for an interleaved
       * 4-channel, 8-bit-per-channel image stored row by row.
       *
       * Each channel is filtered independently. Pixels closer than the window
       * radius k to any image edge are copied through unchanged. The image size is
       * read from `width` and `height`, which are not declared in this function
       * (presumably members of gmedianFilter - confirm against the class).
       *
       * @param inbuf  source pixels, 4 bytes per pixel
       * @param outbuf destination pixels, same layout as inbuf
       * @param n      requested window side length; the window actually gathered is
       *               (2k+1) x (2k+1) with k = (n-1)/2, so an even n acts like n-1
       * @return 0 on success, -1 if n < 1 or the scratch buffer cannot be allocated
       */
      int gmedianFilter::cpu_median(unsigned char* inbuf, unsigned char* outbuf, int n)
      {
          const int channels = 4;

          if (n < 1)
              return -1;

          const int k = (n - 1) / 2;
          // Number of samples really gathered per pixel. Sorting n*n entries instead
          // would read uninitialised bytes whenever n is even.
          const int side = 2 * k + 1;
          const int count = side * side;

          // One scratch allocation holding four planes of `count` samples each
          // (plane c starts at scratch + c*count).
          unsigned char* scratch = (unsigned char*)malloc((size_t)channels * count);
          if (!scratch)
              return -1;

          for (int j = 0; j < height; j++)
          {
              for (int i = 0; i < width; i++)
              {
                  unsigned char* dst = outbuf + (i + j * width) * channels;

                  // Border pixels: the window would leave the image, so pass through.
                  if (i < k || j < k || i > width - k - 1 || j > height - k - 1)
                  {
                      const unsigned char* src = inbuf + (i + j * width) * channels;
                      for (int c = 0; c < channels; c++)
                          dst[c] = src[c];
                      continue;
                  }

                  // De-interleave the neighbourhood into the per-channel planes.
                  int t = 0;
                  for (int t1 = j - k; t1 <= j + k; t1++)
                  {
                      for (int t2 = i - k; t2 <= i + k; t2++)
                      {
                          const unsigned char* src = inbuf + (t2 + t1 * width) * channels;
                          for (int c = 0; c < channels; c++)
                              scratch[c * count + t] = src[c];
                          t++;
                      }
                  }

                  for (int c = 0; c < channels; c++)
                      dst[c] = MiddleValue(scratch + c * count, count);

                  // Debugging aid: dump one fixed pixel for comparison with the GPU output.
                  if (i == 254 && j == 3)
                  {
                      printf("cpu final color:%d, %d,%d,%d,%d,%d\n", i, j, dst[0], dst[1], dst[2], dst[3]);
                  }
              }
          }

          free(scratch);
          return 0;
      }

      // Bubble sort to get the median value.
      // Sorts Array[0..n-1] ascending IN PLACE (the caller's buffer is reordered)
      // and returns Array[(n-1)/2]; for an even n that is the lower of the two
      // middle elements. Returns 0 when n < 1, since there is nothing to read.
      unsigned char gmedianFilter::MiddleValue(unsigned char Array[],int n)
      {
          if (n < 1)
              return 0;

          for (int i = 0; i < n - 1; i++)
          {
              // After pass i the largest i+1 values are already in their final
              // positions at the end, so the inner scan can stop before them.
              for (int j = 0; j < n - 1 - i; j++)
              {
                  if (Array[j] > Array[j + 1])
                  {
                      unsigned char t = Array[j + 1];
                      Array[j + 1] = Array[j];
                      Array[j] = t;
                  }
              }
          }
          return Array[(n - 1) / 2];
      }

       

      Kernel code:

      #pragma OPENCL EXTENSION cl_amd_printf : enable
      // Bubble-sorts Array[0..N-1] (global memory) in place, ordering each of the
      // four channels independently, and returns the per-channel median
      // Array[(N-1)/2]. Because every channel is ordered on its own, the returned
      // uchar4 need not equal any single input element.
      // Returns (0,0,0,0) when N == 0.
      uchar4 sort(__global uchar4* Array, uint N)
      {
          // N is unsigned: without this guard N-1 would wrap around for N == 0.
          if (N == 0)
              return (uchar4)(0);

          for (uint i = 0; i + 1 < N; i++)
          {
              // After pass i the largest i+1 values per channel are already in place.
              for (uint j = 0; j + 1 + i < N; j++)
              {
                  // Compare-and-swap all four channels at once: the smaller value of
                  // each channel goes to slot j, the larger to slot j+1. Whole-vector
                  // stores avoid writing individual bytes of a uchar4.
                  uchar4 a = Array[j];
                  uchar4 b = Array[j+1];
                  Array[j] = min(a, b);
                  Array[j+1] = max(a, b);
              }
          }
          return Array[(N-1)/2];
      }
      // Local-memory variant of the per-channel bubble sort: orders Array[0..N-1]
      // in place, each of the four channels independently, and returns the
      // per-channel median Array[(N-1)/2]. The returned uchar4 need not equal any
      // single input element. Returns (0,0,0,0) when N == 0.
      uchar4 sortlocal(__local uchar4* Array, uint N)
      {
          // N is unsigned: without this guard N-1 would wrap around for N == 0.
          if (N == 0)
              return (uchar4)(0);

          for (uint i = 0; i + 1 < N; i++)
          {
              // After pass i the largest i+1 values per channel are already in place.
              for (uint j = 0; j + 1 + i < N; j++)
              {
                  // Per-channel compare-and-swap done with whole-vector loads/stores.
                  uchar4 a = Array[j];
                  uchar4 b = Array[j+1];
                  Array[j] = min(a, b);
                  Array[j+1] = max(a, b);
              }
          }
          return Array[(N-1)/2];
      }
      //compared with the former function, kernel execution time is 6 times
      // Median-filter kernel that uses a global-memory scratch buffer.
      // One work-item handles one pixel; the image width/height are taken from the
      // global work size, so the NDRange must match the image dimensions.
      //   inputImage  - source pixels, one uchar4 per pixel
      //   outputImage - destination pixels, same layout
      //   tempbuf     - scratch space; work-item (x,y) uses the N*N entries starting
      //                 at (x + y*width)*N*N
      //   N           - requested window side; the window gathered is (2k+1)x(2k+1)
      //                 with k = (N-1)/2
      // Pixels closer than k to an image edge are copied through unchanged.
      __kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage,__global uchar4* tempbuf, uint N)
      {
          int x = get_global_id(0);
          int y = get_global_id(1);
          int width = get_global_size(0);
          int height = get_global_size(1);

          int k = (N-1)/2;
          int n = N*N; // scratch stride per work-item

          if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
          {
              outputImage[x + y * width] = inputImage[x + y * width];
              return;
          }

          // This work-item's private slice of the scratch buffer.
          __global uchar4* window = tempbuf + (x + y*width)*n;

          int t = 0;
          for(int j = y - k; j <= y + k; j++)
          {
              for(int i = x - k; i <= x + k; i++)
              {
                  window[t] = inputImage[i + j * width];
                  t++;
              }
          }

          // Sort only the t samples that were gathered. For even N that is fewer
          // than n, and sorting n entries would include uninitialised scratch data.
          uchar4 finalcolor = sort(window, t);

          //if(x==254 && y==3)
          // printf("final color:%d, %d,%d,%d,%d,%d\n", x,y,finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
          outputImage[x + y * width] = finalcolor;
      }

      // Median-filter kernel that uses a local-memory scratch buffer.
      // One work-item handles one pixel; the image width/height are taken from the
      // global work size, so the NDRange must match the image dimensions.
      //   inputImage  - source pixels, one uchar4 per pixel
      //   outputImage - destination pixels, same layout
      //   tempbuf     - local scratch; each work-item uses N*N entries starting at
      //                 (local_y*local_size_x + local_x)*N*N, so the buffer must
      //                 hold at least local_size_x*local_size_y*N*N uchar4 values
      //   N           - requested window side; the window gathered is (2k+1)x(2k+1)
      //                 with k = (N-1)/2
      // Pixels closer than k to an image edge are copied through unchanged.
      __kernel void filterlocal(__global uchar4* inputImage, __global uchar4* outputImage,__local uchar4* tempbuf, uint N)
      {
          int x = get_global_id(0);
          int y = get_global_id(1);
          int width = get_global_size(0);
          int height = get_global_size(1);
          int xid = get_local_id(0);
          int yid = get_local_id(1);
          int xwidth = get_local_size(0);

          int k = (N-1)/2;
          int n = N*N; // scratch stride per work-item

          if(x < k || y < k || x > width - k - 1 || y > height - k - 1)
          {
              outputImage[x + y * width] = inputImage[x + y * width];
              return;
          }

          // This work-item's private slice of the work-group's local scratch.
          __local uchar4* window = tempbuf + (yid*xwidth + xid)*n;

          int t = 0;
          for(int j = y - k; j <= y + k; j++)
          {
              for(int i = x - k; i <= x + k; i++)
              {
                  window[t] = inputImage[i + j * width];
                  t++;
              }
          }

          // Sort only the t samples that were gathered. For even N that is fewer
          // than n, and sorting n entries would include uninitialised scratch data.
          uchar4 finalcolor = sortlocal(window, t);

          //if(x==5 && y==5)
          // printf("final color:%d,%d,%d,%d\n", finalcolor.x, finalcolor.y, finalcolor.z,finalcolor.w);
          outputImage[x + y * width] = finalcolor;
      }

       

      And the host code that calls the kernel:

      // Create the kernel object for the "filter" entry point of the already-built program.
      // NOTE: every errcode_ret below is NULL and the clSetKernelArg results are
      // ignored, so a failed call is not detected here.
      filter = clCreateKernel( program, "filter", NULL );

      t.Reset();
      t.Start();

      // Create the input, output and scratch buffers.
      // src_buf is initialised from the host image `buf` (CL_MEM_COPY_HOST_PTR).
      src_buf = clCreateBuffer(context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            width*height* 4 * sizeof(cl_uchar),
            buf,
            NULL );
      dst_buf = clCreateBuffer( context,
            CL_MEM_READ_WRITE,
            width*height* 4 * sizeof(cl_uchar),
            NULL, NULL );

      // Scratch space for the kernel: N*N four-byte samples per pixel.
      tmp_buf = clCreateBuffer(context,
            CL_MEM_READ_WRITE,
            width*height* 4 * sizeof(cl_uchar)*N*N,
            NULL,
            NULL );


      t.Stop();
      printf("copy from host to device :%.6f ms \n ", t.GetElapsedTime() *1000);

      // Buffer arguments are passed as cl_mem handles, so arg_size is sizeof(cl_mem).
      clSetKernelArg(filter, 0, sizeof(cl_mem), (void*) &src_buf);
      clSetKernelArg(filter, 1, sizeof(cl_mem), (void*) &dst_buf);
      clSetKernelArg(filter, 2, sizeof(cl_mem), (void*) &tmp_buf);
      //clSetKernelArg(filter, 2,32768,NULL);
      //clSetKernelArg(filter, 2,width*height* 4 * sizeof(cl_uchar)*N*N,NULL);
      clSetKernelArg(filter, 3, sizeof(cl_uint), &N);