AnsweredAssumed Answered

How can I optimise my algorithm?

Question asked by jzuber on Oct 3, 2012
Latest reply on Oct 24, 2012 by notzed

I am trying to increase throughput by using AMD GPGPU and OpenCL, and I am getting less performance than a CPU using a loop. Also take note: this is my first openCL kernel so I understand if it is complete garbage.

 

Essentially I get large amounts of data, and for each of those data objects, I need to run some conversion, then compare to a 512*7 sized look-up table, and do a sum of squares of each of those 7 values (I will illustrate below).  Currently I want to see the gain I can get with a less powerful GPGPU (Radeon 6570), with the hopes that any gain at all given by this will only get better as the GPGPU I select gets better.

 

Here is my Kernel code:

 

  #define LUT_0_1_TABLE_ROW_SIZE          512                                                                                                            
  #define LUT_2_TABLE_COL_SIZE            12                                                                                                             
  #define LUT_2_TABLE_ROW_SIZE            207                                                                                                            
  #define LUT_2_TABLE_SIZE                LUT_2_TABLE_COL_SIZE*LUT_2_TABLE_ROW_SIZE                                                                    


  struct object
  {                                                                                                                                               
     float m_fval0;                                                                                                                             
     float m_fval1;                                                                                                                            
     float m_fval2;                                                                                                                           
     float m_fval3;                                                                                                                            
     float m_fval4;                                                                                                                            
     float m_fval5;                                                                                                                          
     float m_fval6;                                                                                                                          
     float m_fval7;                                                                                                                              
     unsigned int m_uval0;                                                                                                                         
     unsigned int m_uval1;                                                                                                                         
     unsigned int m_uval2;                                                                                                                         
     unsigned int m_uval3;                                                                                                                           
     unsigned int m_uval4;                                                                                                                      
  };                                                                                                                                              

 ushort getLut2TableIndex(float fin0, float fin1)                                           
  {                                                                                                                                               
     float fMin;                                                                                                                                  
     ushort tableIndex = (ushort)(fin1/7); // use tempIndex as a temp var.....Hacks!!                                     
     ushort minIdx = min(max(tableIndex, (ushort)0), (ushort)7);                                                                                  
     tableIndex = (minIdx == 1 ? 2 : minIdx);  // compensate for the fact that the "1" position doesn't exist                  
     tableIndex *= 26;                                                     
     tableIndex -= (minIdx < 2 ? 1 : 0);  // compensation             
     fMin = (fin0- minIdx);                                                                                       
     tableIndex += (ushort)fMin;                                                                                                                  


     // Make sure the idx is in a valid range - otherwise we can crash!                                                                           
     tableIndex = min(max(minIdx, (ushort)0), (ushort)(LUT_1_TABLE_ROW_SIZE - 1));  // clamp returns the value inside the min and max 
     // not using clamp because of using openCL 1.0 -> final release will use openCL 1.1....           
    return tableIndex;                                                                                                                           
  }                                                                                                                                                                          


  __kernel void convertPdwsAndPerformAoa(global uint8 * input0, global uint8 * input1, global uint8 * input2, global uint8 * input3,                
                                                                                                    global struct object * output, constant float * lut0, constant float4 * lut1,                           
                                                                                                    constant float * lut2, float fin0, local float4 * lut1Local)                               
{                                                                                                                                                 
  float ftemp0 = -9.3f;                                                                           
  float ftemp1 = 12.0f;                                                                               
  int itemp0 = 0;                                                                                                                           
  float tempLog = 0.0f;                                                                                                                           
  float4 f4temp0 = { 0.0f, 0.0f, 0.0f, 0.0f };                                                                         
  float4 f4temp1 = { 0.0f, 0.0f, 0.0f, 0.0f };                                                                                                                           
  float8 f8temp0 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };                                                                             
  float8 f8temp1 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };                                                                                                        
  uint8 input = input3[gid];   
  struct object tempOut;                                                                                                                                 
  ushort lut2ColGroupIndex = 0;                                                                                                                      
  ushort lut2RowIdx;                                                                                                                                 
  ushort lut0RowIdx = 0;                                                                                                                                 
  float sumOfSquares;                                                                                                                             
  float minSum = MAXFLOAT;

  uint gid = get_global_id(0);                                                                                                                  
  uint lid = get_local_id(0);                                                                                                                     
  uint nloc = get_local_size(0);                                                                                                                    

  // Copy the lut1 global vector into a local vector                                                                                              
  for (unsigned int i = lid; i < LUT_0_1_TABLE_ROW_SIZE; i += nloc)                                                                                      
  {                                                                                                                                               
     lut1Local[i] = lut1[i];                                                                                                                      
  }                                                                                                                                               
  barrier(CLK_LOCAL_MEM_FENCE);    

  // Conversion code -> this is fast no need to look at!    
  // .................  
  //  **************************

  lut2RowIdx = getLut2TableIndex(tempOut.m_fval0, fin0);                                                         
  f4temp0.s0 = f4temp0.s0/f4temp1.s0;                                                                                                     
  f4temp0.s1 = f4temp0.s1/f4temp1.s1 + lut2[lut2RowIdx + 3*lut2ColGroupIndex];                                                                 
  f4temp0.s2 = f4temp0.s2/f4temp1.s2 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 1];                                                             
  f4temp0.s3 = f4temp0.s3/f4temp1.s3 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 2];                                                             

  f8temp0.s0 = f4temp0.s0 - f4temp0.s1;                                                                                                        
  f8temp0.s1 = f4temp0.s0 - f4temp0.s2;                                                                                                        
  f8temp0.s2 = f4temp0.s0 - f4temp0.s3;                                                                                                        
  f8temp0.s3 = f4temp0.s1 - f4temp0.s2;                                                                                                        
  f8temp0.s4 = f4temp0.s2 - f4temp0.s3;                                                                                                        
  f8temp0.s5 = f4temp0.s1 - f4temp0.s3;                                                                                                        
  f8temp0.s6 = f8temp0.s3 - f8temp0.s4;                                                                                                            


  // Calculate the cost function                                                                                                                  
  for (unsigned short idx = 0; idx < LUT_0_1_TABLE_ROW_SIZE; ++idx)                                                                                       
  {                                                                                                                                               
     // Calculate the lut1 components                                                                                                             
     f8temp1.s0 = lut1Local[idx].s0*tempOut.m_fval0;                                                                                             
     f8temp1.s1 = lut1Local[idx].s1*tempOut.m_fval0;                                                                                              
     f8temp1.s2 = lut1Local[idx].s2*tempOut.m_fval0;                                                                                              
     f8temp1.s3 = f8temp1.s0 - f8temp1.s1;                                                                                               
     f8temp1.s4 = f8temp1.s0 - f8temp1.s2;                                                                                               
     f8temp1.s5 = f8temp1.s0 - f8temp1.s3;                                                                                               
     f8temp1.s6 = f8temp1.s3 - f8temp1.s4;                                                                                               

     f8temp0 -= f8temp1;  

     f8temp1 += (f8temp1 > (float8)100.0f ? (float8)-5.0f : (float8)0.0f);                                                                

           // Square all elements of the vector                                                                                                         
     f8temp1 *= f8temp1;       

           // Sum all elements of the vector together                                                                                                  
     sumOfSquares = f8temp1.s0;                                                                                                                
     sumOfSquares += f8temp1.s1;                                                                                                               
     sumOfSquares += f8temp1.s2;                                                                                                               
     sumOfSquares += f8temp1.s3;                                                                                                               
     sumOfSquares += f8temp1.s4;                                                                                                               
     sumOfSquares += f8temp1.s5;                                                                                                               
     sumOfSquares += f8temp1.s6;                                                                                                               

          // If the new sum is < the old min, then this is the new min!  Update the idx and the minSum.                                                
     lut0RowIdx = (minSum < sumOfSquares ? lut0RowIdx : idx);                                                                                             
     minSum = fmin(minSum, sumOfSquares);                                                                                                         
  }                                                                                                                                               
  lut0RowIdx = min(max(lut0RowIdx, (ushort)0), (ushort)(LUT_0_1_TABLE_ROW_SIZE - 1)); // clamp                                                                    
  tempOut.m_fin4 = lut0[lut0RowIdx];                                                                                                             
  output[gid] = tempOut;                                                                                                                           
}  

 

Anyway I obfuscated my code a bit, so there may be some things which don't make a lot of sense, but I need to do all those operations.  One option I was looking at was have one kernel to do the conversion, then another for the cost function -> though the cpu would have to loop through and run the cost function kernel  for each of the intended objects.

 

Thanks for taking a look!

Outcomes