5 Replies Latest reply on Jul 16, 2009 4:06 PM by gaurav.garg

    Is there anyway to optimize this kernel?

    riza.guntur
      [ALGORITHMICALLY ERROR,WILL BE FIXED]

      I have a program code like the following:

      kernel void miu(int2 size, float4 max<>, float4 min<>, float4 median<>, float4 vec_ref_max[][], float4 vec_ref_min[][], float4 vec_ref_median[][], out float4 miu<>)
      {
          int2 index = instance().xy;
          int index_x = index.x;
          int index_y = index.y;
          float4 output;
          float4 tp1,tp2;
          float4 posisi_x;
          
          //size is the size of vec_ref_
          for(int i = 0; i < size.y; i++)
          {
              if (median.x == vec_ref_median[index_x].x) // berarti posisi sudah sama persih
              {
                  output.x = 1.0f;
              }
              else if (median.x > vec_ref_median[index_x]
      .x) // rumus a
              {
                  posisi_x.x = (median.x - min.x) * vec_ref_max[index_x].x;
                  posisi_x.x = posisi_x.x / (vec_ref_max.x[index_x]
      - vec_ref_median[index_x].x);
                  posisi_x.x = posisi_x.x + min.x;
                  posisi_x.x = posisi_x.x / ( 1 + median.x - min.x)/(vec_ref_max[index_x]
      .x - vec_ref_median[index_x].x));

                  tp1.x = (max.x -  posisi_x.x)/(max.x - median.x);
                  tp2.x = (vec_ref_max[index_x]
      .x -  posisi_x.x)/(vec_ref_max[index_x].x - vec_ref_median[index_x].x);

              }
              else // if (median.x < vec_ref_median.x) // rumus b
              {
                  posisi_x.x = (vec_ref_median[index_x].x - vec_ref_min[index_x].x) * max.x;
                  posisi_x.x = posisi_x.x / (max.x - median.x);
                  posisi_x.x = posisi_x.x + vec_ref_min[index_x].x;
                  posisi_x.x = posisi_x.x / ( 1 + (vec_ref_median[index_x]
      .x - vec_ref_min[index_x].x)/(max.x - median.x));

                  tp1.x = (max.x -  posisi_x.x)/(max.x - median.x);
                  tp2.x = (vec_ref_median[index_x]
      .x -  posisi_x.x)/(vec_ref_median[index_x].x - vec_ref_min[index_x].x);

              }
             
              if ((tp1.x > 0) && (tp1.x <= 1))
              {
                  output.x = tp1.x;
              }
              else if ((tp2.x>0) && (tp2.x<=1))
              {
                  output.x = tp2.x;
              }
              else
              {
                  output.x = 0;
              }
             
             
             
              if (median.y == vec_ref_median[index_x].y) // berarti posisi sudah sama persih
              {
                  output.y = 1.0f;
              }
              else if (median.y > vec_ref_median[index_x]
      .y) // rumus a
              {
                  posisi_x.y = (median.y - min.y) * vec_ref_max[index_x].y;
                  posisi_x.y = posisi_x.y / (vec_ref_max.y[index_x]
      - vec_ref_median[index_x].y);
                  posisi_x.y = posisi_x.y + min.y;
                  posisi_x.y = posisi_x.y / ( 1 + median.y - min.y)/(vec_ref_max[index_x]
      .y - vec_ref_median[index_x].y));

                  tp1.y = (max.y -  posisi_x.y)/(max.y - median.y);
                  tp2.y = (vec_ref_max[index_x]
      .y -  posisi_x.y)/(vec_ref_max[index_x].y - vec_ref_median[index_x].y);

              }
              else // if (median.y < vec_ref_median.y) // rumus b
              {
                  posisi_x.y = (vec_ref_median[index_x].y - vec_ref_min[index_x].y) * max.y;
                  posisi_x.y = posisi_x.y / (max.y - median.y);
                  posisi_x.y = posisi_x.y + vec_ref_min[index_x].y;
                  posisi_x.y = posisi_x.y / ( 1 + (vec_ref_median[index_x]
      .y - vec_ref_min[index_x].y)/(max.y - median.y));

                  tp1.y = (max.y -  posisi_x.y)/(max.y - median.y);
                  tp2.y = (vec_ref_median[index_x]
      .y -  posisi_x.y)/(vec_ref_median[index_x].y - vec_ref_min[index_x].y);

              }
             
              if ((tp1.y > 0) && (tp1.y <= 1))
              {
                  output.y = tp1.y;
              }
              else if ((tp2.y>0) && (tp2.y<=1))
              {
                  output.y = tp2.y;
              }
              else
              {
                  output.y = 0;
              }
             
             
              if (median.z == vec_ref_median[index_x].z) // berarti posisi sudah sama persih
              {
                  output.z = 1.0f;
              }
              else if (median.z > vec_ref_median[index_x]
      .z) // rumus a
              {
                  posisi_x.z = (median.z - min.z) * vec_ref_max[index_x].z;
                  posisi_x.z = posisi_x.z / (vec_ref_max.z[index_x]
      - vec_ref_median[index_x].z);
                  posisi_x.z = posisi_x.z + min.z;
                  posisi_x.z = posisi_x.z / ( 1 + median.z - min.z)/(vec_ref_max[index_x]
      .z - vec_ref_median[index_x].z));

                  tp1.z = (max.z -  posisi_x.z)/(max.z - median.z);
                  tp2.z = (vec_ref_max[index_x]
      .z -  posisi_x.z)/(vec_ref_max[index_x].z - vec_ref_median[index_x].z);

              }
              else // if (median.z < vec_ref_median.z) // rumus b
              {
                  posisi_x.z = (vec_ref_median[index_x].z - vec_ref_min[index_x].z) * max.z;
                  posisi_x.z = posisi_x.z / (max.z - median.z);
                  posisi_x.z = posisi_x.z + vec_ref_min[index_x].z;
                  posisi_x.z = posisi_x.z / ( 1 + (vec_ref_median[index_x]
      .z - vec_ref_min[index_x].z)/(max.z - median.z));

                  tp1.z = (max.z -  posisi_x.z)/(max.z - median.z);
                  tp2.z = (vec_ref_median[index_x]
      .z -  posisi_x.z)/(vec_ref_median[index_x].z - vec_ref_min[index_x].z);

              }
             
              if ((tp1.z > 0) && (tp1.z <= 1))
              {
                  output.z = tp1.z;
              }
              else if ((tp2.z>0) && (tp2.z<=1))
              {
                  output.z = tp2.z;
              }
              else
              {
                  output.z = 0;
              }
             
             
              if (median.w == vec_ref_median[index_x].w) // berarti posisi sudah sama persih
              {
                  output.w = 1.0f;
              }
              else if (median.w > vec_ref_median[index_x]
      .w) // rumus a
              {
                  posisi_x.w = (median.w - min.w) * vec_ref_max[index_x].w;
                  posisi_x.w = posisi_x.w / (vec_ref_max.w[index_x]
      - vec_ref_median[index_x].w);
                  posisi_x.w = posisi_x.w + min.w;
                  posisi_x.w = posisi_x.w / ( 1 + median.w - min.w)/(vec_ref_max[index_x]
      .w - vec_ref_median[index_x].w));

                  tp1.w = (max.w -  posisi_x.w)/(max.w - median.w);
                  tp2.w = (vec_ref_max[index_x]
      .w -  posisi_x.w)/(vec_ref_max[index_x].w - vec_ref_median[index_x].w);

              }
              else // if (median.w < vec_ref_median.w) // rumus b
              {
                  posisi_x.w = (vec_ref_median[index_x].w - vec_ref_min[index_x].w) * max.w;
                  posisi_x.w = posisi_x.w / (max.w - median.w);
                  posisi_x.w = posisi_x.w + vec_ref_min[index_x].w;
                  posisi_x.w = posisi_x.w / ( 1 + (vec_ref_median[index_x]
      .w - vec_ref_min[index_x].w)/(max.w - median.w));

                  tp1.w = (max.w -  posisi_x.w)/(max.w - median.w);
                  tp2.w = (vec_ref_median[index_x]
      .w -  posisi_x.w)/(vec_ref_median[index_x].w - vec_ref_min[index_x].w);

              }
             
              if ((tp1.w > 0) && (tp1.w <= 1))
              {
                  output.w = tp1.w;
              }
              else if ((tp2.w>0) && (tp2.w<=1))
              {
                  output.w = tp2.w;
              }
              else
              {
                  output.w = 0;
              }
          }
         
          miu = output;
      }

      This kernel is about to be called 1000 times.

        • Is there anyway to optimize this kernel?
          genaganna

          Riza.guntur,

           

                You might get help if you explain algorithm and what you have done so far.

            • Is there anyway to optimize this kernel?
              riza.guntur

              Umm... Actually I've made some fixing with the code above and also the caller method because of algorithmically broken (and erroneous syntax errors, and that miu function still get)

              The algorithm named Fuzzy Neuro Linear Vector Quantization

              This what I've done so far:

              1. Fuzzify input training. Finding min, max, and median of group of five for each dimension of 16 for 480 input training with 80 samples per target). I've done it with reduction.

              2. Pick reference vector. I've done it with copy4 kernel, picking 1 sample as reference vector for each target, then I get 6 cluster, each has 16 dimension with its min, max, and median for each dimension.

              3. My next step here, I want to  calculate the miu for one input training which has 3 sufficient condition, if it has the same median, else if the input median bigger than reference vector median, and else if smaller and another condition if not intersect. I do it for each corresponding dimension (not cross-product) for each cluster.

              4. Finding winner cluster. The next step is finding the miu which is the smallest in each cluster. Then find the biggest miu. The cluster which has the biggest of smallest is the winner.

              5. Move the reference vector based on similarity. The following condition applies to the winner cluster it has miu value of 0 else if the winner cluster is the positioned the same as output else if different. In those condition, th cluster median (altogether) would be shifted nearer or farther, and while the min and max will be stretched out.

              6. Steps 3 to 5 would be applied to all input training fuzzy number (16 fuzzy vector)

              7. Steps 3 to 6 will be iterated 1000 times, or after errors reach near tolerance.

              After that, goes testing. The same as 3 to 4. Finding how much recognition I will get.

              I pretty confused with the data structure, the previous one has been replaced with current below.

              I've done these:

              #include "brookgenfiles/percobaan_pertama.h"
              #include
              #include
              #include
              using namespace std;
              using namespace brook;

              int
              main(int argc, char* argv[])
              {
                  unsigned int jumlahData = 480;
                  unsigned int jumlahDiSatuGrup = 5;
                  unsigned int jumlahDimensi = 16;
                  unsigned int jumlahOutput = 8;
                  unsigned int yA = jumlahData;
                  unsigned int yB = yA/jumlahDiSatuGrup;
                  unsigned int yC = jumlahOutput;
                  unsigned int streamSize[] = {yA,jumlahDimensi};
                  unsigned int streamSizeReduce[] = {yB,jumlahDimensi};
                  unsigned int streamSizeReduceRef[] = {yC,jumlahDimensi};
                  float alpha = 0.05f;
                  float elta = 1.1f;

                  unsigned int rank[2] = {1,2};

                  float3 *arr0 = new float3[jumlahDimensi*yA];
                  float3 *arr1 = new float3[jumlahDimensi*yB];
                  float3 *arr2 = new float3[jumlahDimensi*yC];
                  memset(arr0, 0, jumlahDimensi * yA * sizeof(float3));
                  memset(arr1, 0, jumlahDimensi * yB * sizeof(float3));
                  memset(arr2, 0, jumlahDimensi * yC * sizeof(float3));

                  ifstream inFile;
                  
                  inFile.open("train data.txt");
                  if (!inFile) {
                      cout << "Unable to open file";
                      exit(1); // terminate with error
                  }

                  for(unsigned int i = 0; i < jumlahDimensi; i++)
                  {
                      for(unsigned int j = 0; j < yA; j++)
                      {
                          unsigned int index = i * yA + j;
                          float temp;
                          if( inFile >> temp)
                              arr0[index].x = temp;
                      }
                  }

                  Stream input(rank[0], streamSize);//stream input training
                  Stream input_max_min(rank[0], streamSizeReduce);//y max, z min
                  Stream fuzzy_number(rank[0], streamSizeReduce);//x median
                  Stream vec_ref(rank[1], streamSizeReduceRef);//stream ref
                  Stream myu(rank[1], streamSizeReduceRef);//stream myu

                  streamRead(input,arr0);//memindahkan input ke stream
                  max_min(input,input_max_min);//mencari max min
                  fuzzify(input_max_min,fuzzy_number);//mencari median
                  copy3(fuzzy_number,vec_ref);//memasukkan max ke stream referensi


                  inFile.close();
                  delete[] arr0;
                  delete[] arr1;
                  delete[] arr2;

                  getchar();
                  return 0;
              }

              .br

              reduce void max_min(float3 input<>, reduce float3 output<>)
              {
                  if(input.x > output.y)
                  {
                      output.y = input.x;
                  }
                  if(input.x < output.z)
                  {
                      output.z = input.z;
                  }
              }

              kernel void fuzzify(float3 input<>, out float3 output<>)
              {
                  float3 temp;
                  temp.x = input.y + input.z;
                  temp.yz = input.yz;
                  output = temp;
              }

              kernel void copy3(float3 input<>, out float3 output<>)
              {
                  output = input;
              }

                • Is there anyway to optimize this kernel?
                  riza.guntur

                  Thanks for the correction. I was confuse at first why miu fail to be compiled.

                  Here is the main method code before that supposed to use the miu function:

                  #include "brookgenfiles/percobaan_pertama.h"
                  #include
                  #include
                  #include
                  using namespace std;
                  using namespace brook;

                  int
                  main(int argc, char* argv[])
                  {
                      /*
                      Segala pembagian dengan 4 menandakan penggunaan float4 pada fungsi
                      */
                      unsigned int jumlahData = 480;
                      unsigned int jumlahDiSatuGrup = 5;
                      unsigned int jumlahDimensi = 16;
                      unsigned int jumlahOutput = 8;
                      unsigned int yA = jumlahData/4;
                      unsigned int yB = yA/jumlahDiSatuGrup;
                      unsigned int streamSize[] = {yA, jumlahDimensi};
                      unsigned int streamSizeReduce[] = {yB,jumlahDimensi};
                      unsigned int streamSizeReduceRef[] = {jumlahOutput/4,jumlahDimensi};

                      unsigned int rank = 2;

                      float4 *arr0 = new float4[jumlahDimensi*yA];
                      float4 *arr1 = new float4[jumlahDimensi*yB];
                      float4 *arr2 = new float4[jumlahDimensi*yB];
                      float4 *arr3 = new float4[jumlahDimensi*yB];
                      memset(arr0, 0, jumlahDimensi * yA * sizeof(float4));
                      memset(arr1, 0, jumlahDimensi * yB * sizeof(float4));
                      memset(arr2, 0, jumlahDimensi * yB * sizeof(float4));
                      memset(arr3, 0, jumlahDimensi * yB * sizeof(float4));

                      ifstream inFile;
                      
                      inFile.open("train data.txt");
                      if (!inFile) {
                          cout << "Unable to open file";
                          exit(1); // terminate with error
                      }

                      for(unsigned int i = 0; i < jumlahDimensi; i++)
                      {
                          for(unsigned int j = 0; j < yA; j++)
                          {
                              unsigned int index = i * yA + j;
                              float temp;
                              if( inFile >> temp)
                                  arr0[index].x = temp;
                              if( inFile >> temp)
                                  arr0[index].y = temp;
                              if( inFile >> temp)
                                  arr0[index].z = temp;
                              if( inFile >> temp)
                                  arr0[index].w = temp;
                          }
                      }

                      Stream streami0(rank, streamSize);
                      Stream streami1(rank, streamSizeReduce);
                      Stream streami2(rank, streamSizeReduce);
                      Stream streami3(rank, streamSizeReduce);
                      Stream streami4(rank, streamSizeReduceRef);
                      Stream streami5(rank, streamSizeReduceRef);
                      Stream streami6(rank, streamSizeReduceRef);

                      streamRead(streami0,arr0);
                      max_reduce4(streami0,streami2);
                      min_reduce4(streami0,streami3);
                      mid_point4(streami2,streami3,streami1);
                      copy4(streami2,streami5);
                      copy4(streami3,streami6);
                      copy4(streami1,streami4);
                      streamWrite(streami1,arr1);
                      streamWrite(streami2,arr2);
                      streamWrite(streami3,arr3);

                      inFile.close();
                      delete[] arr0;
                      delete[] arr1;
                      delete[] arr2;
                      delete[] arr3;

                      getchar();
                      return 0;
                  }

                  The reason why I'm using float4 streams separately for median, min, max because at first I think it would compute to the reference vector. But I was wrong.

                  The instance().xy in miu function is meant to compared the input and winner cluster IF my algorithm wrong but the program structure would change a lot if that happens.

                  So I change to, I think, a better data structure.

                  P.S. : This previous main mathod has broken stream size and broken sample alignment (it is hard to think SIMD up until now). To get a big picture of correct problem please refer to current main method

                    • Is there anyway to optimize this kernel?
                      gaurav.garg

                      I could not follow the complete thread. But, I would give the following suggestions based on the your code-

                      1. Avoid using float3 datatype, instead use float4. float3 has lot more overhead in data transfer.

                      2. If you have a non-reduce kernel, all the streams (input or output declared with <> ) should have  same size. If they are of different size use gather streams instead. Streams with different size have to do stream resizing that has lot of overhead.

                      3. Try to combine multiple scalar instruction with swizzling on float4 datatype. It helps the compiler in better optimizations.

                • Is there anyway to optimize this kernel?
                  Jawed

                  How many elements are there in each of the following streams:

                  max
                  min
                  median
                  vec_ref_max
                  vec_ref_min
                  vec_ref_median
                  miu

                  The loop seems to be identical every time it executes, so it is superfluous.

                  Why are you using gather when every gather is from the same element? Gather is only useful when you compute a gather address. You never compute a gather address. Streams, by default, index their elements according to the position in the result stream, i.e. the position in miu determines the position in the three streams vec_ref_max, vec_ref_min and vec_ref_median, so you do not need to do any indexing if this is the only indexing required.

                  You can't name a kernel the same as the output stream - you have used "miu" for both.

                  I have revised your kernel, calling it miucalcJA01, guessing your intentions, and I use a kernel called evaluate which is called four times. I have restructured the IF statements for higher performance (but I don't have hardware in order to test for performance, so this is a guess based upon Stream Kernel Analyzer):

                  kernel float
                  evaluate( float max, float min, float median,
                     float vec_ref_max,
                     float vec_ref_min,
                     float vec_ref_median)
                  {
                   float output = 0.f ;
                   float tp1 = 0.f ;
                   float tp2 = 0.f ;
                   float posisi_x ;

                   if (median == vec_ref_median) // berarti posisi sudah sama persih
                   {
                    output = 1.0f;
                   }
                   else if (median > vec_ref_median) // rumus a
                   {
                    posisi_x = (median - min) * vec_ref_max;
                    posisi_x = posisi_x / (vec_ref_max - vec_ref_median);
                    posisi_x = posisi_x + min;
                    posisi_x = posisi_x / ( 1.f + median - min)/(vec_ref_max - vec_ref_median);

                    tp1 = (max -  posisi_x)/(max - median);
                    tp2 = (vec_ref_max -  posisi_x)/(vec_ref_max - vec_ref_median);
                    if ((tp1 > 0.f) && (tp1 <= 1.f))
                    {
                     output = tp1;
                    }
                    else if ((tp2>0.f) && (tp2<=1.f))
                    {
                     output = tp2;
                    }
                   }
                   else // rumus b
                   {
                    posisi_x = (vec_ref_median - vec_ref_min) * max;
                    posisi_x = posisi_x / (max - median);
                    posisi_x = posisi_x + vec_ref_min;
                    posisi_x = posisi_x / ( 1.f + (vec_ref_median - vec_ref_min)/(max - median));

                    tp1 = (max -  posisi_x)/(max - median);
                    tp2 = (vec_ref_median -  posisi_x)/(vec_ref_median - vec_ref_min);

                    if ((tp1 > 0.f) && (tp1 <= 1.f))
                    {
                     output = tp1;
                    }
                    else if ((tp2>0.f) && (tp2<=1.f))
                    {
                     output = tp2;
                    }
                   }
                   return output ;
                  }

                  kernel void
                  miucalcJA01(float4 max, float4 min, float4 median,
                     float4 vec_ref_max<>,
                     float4 vec_ref_min<>,
                     float4 vec_ref_median<>,
                     out float4 miu<>)
                  {
                   miu.x=evaluate(max.x,min.x,median.x,vec_ref_max.x,vec_ref_min.x,vec_ref_median.x) ;
                   miu.y=evaluate(max.y,min.y,median.y,vec_ref_max.y,vec_ref_min.y,vec_ref_median.y) ;
                   miu.z=evaluate(max.z,min.z,median.z,vec_ref_max.z,vec_ref_min.z,vec_ref_median.z) ;
                   miu.w=evaluate(max.w,min.w,median.w,vec_ref_max.w,vec_ref_min.w,vec_ref_median.w) ;
                  }