# How can I optimise my algorithm?

Question asked by jzuber on Oct 3, 2012
Latest reply on Oct 24, 2012 by notzed

I am trying to increase throughput by using AMD GPGPU and OpenCL, and I am getting less performance than a CPU using a loop. Also take note: this is my first openCL kernel so I understand if it is complete garbage.

Essentially I get large amounts of data, and for each of those data objects, I need to run some conversion, then compare to a 512*7 sized look-up table, and do a sum of squares of each of those 7 values (I will illustrate below).  Currently I want to see the gain I can get with a less powerful GPGPU (Radeon 6570), with the hopes that any gain at all given by this will only get better as the GPGPU I select gets better.

Here is my Kernel code:

```  #define LUT_0_1_TABLE_ROW_SIZE          512
#define LUT_2_TABLE_COL_SIZE            12
#define LUT_2_TABLE_ROW_SIZE            207
#define LUT_2_TABLE_SIZE                LUT_2_TABLE_COL_SIZE*LUT_2_TABLE_ROW_SIZE

struct object
{
float m_fval0;
float m_fval1;
float m_fval2;
float m_fval3;
float m_fval4;
float m_fval5;
float m_fval6;
float m_fval7;
unsigned int m_uval0;
unsigned int m_uval1;
unsigned int m_uval2;
unsigned int m_uval3;
unsigned int m_uval4;
};

ushort getLut2TableIndex(float fin0, float fin1)
{
float fMin;
ushort tableIndex = (ushort)(fin1/7); // use tempIndex as a temp var.....Hacks!!
ushort minIdx = min(max(tableIndex, (ushort)0), (ushort)7);
tableIndex = (minIdx == 1 ? 2 : minIdx);  // compensate for the fact that the "1" position doesn't exist
tableIndex *= 26;
tableIndex -= (minIdx < 2 ? 1 : 0);  // compensation
fMin = (fin0- minIdx);
tableIndex += (ushort)fMin;

// Make sure the idx is in a valid range - otherwise we can crash!
tableIndex = min(max(minIdx, (ushort)0), (ushort)(LUT_1_TABLE_ROW_SIZE - 1));  // clamp returns the value inside the min and max
// not using clamp because of using openCL 1.0 -> final release will use openCL 1.1....
return tableIndex;
}

__kernel void convertPdwsAndPerformAoa(global uint8 * input0, global uint8 * input1, global uint8 * input2, global uint8 * input3,
global struct object * output, constant float * lut0, constant float4 * lut1,
constant float * lut2, float fin0, local float4 * lut1Local)
{
float ftemp0 = -9.3f;
float ftemp1 = 12.0f;
int itemp0 = 0;
float tempLog = 0.0f;
float4 f4temp0 = { 0.0f, 0.0f, 0.0f, 0.0f };
float4 f4temp1 = { 0.0f, 0.0f, 0.0f, 0.0f };
float8 f8temp0 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
float8 f8temp1 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
uint8 input = input3[gid];
struct object tempOut;
ushort lut2ColGroupIndex = 0;
ushort lut2RowIdx;
ushort lut0RowIdx = 0;
float sumOfSquares;
float minSum = MAXFLOAT;

uint gid = get_global_id(0);
uint lid = get_local_id(0);
uint nloc = get_local_size(0);

// Copy the lut1 global vector into a local vector
for (unsigned int i = lid; i < LUT_0_1_TABLE_ROW_SIZE; i += nloc)
{
lut1Local[i] = lut1[i];
}
barrier(CLK_LOCAL_MEM_FENCE);

// Conversion code -> this is fast no need to look at!
// .................
//  **************************

lut2RowIdx = getLut2TableIndex(tempOut.m_fval0, fin0);
f4temp0.s0 = f4temp0.s0/f4temp1.s0;
f4temp0.s1 = f4temp0.s1/f4temp1.s1 + lut2[lut2RowIdx + 3*lut2ColGroupIndex];
f4temp0.s2 = f4temp0.s2/f4temp1.s2 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 1];
f4temp0.s3 = f4temp0.s3/f4temp1.s3 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 2];

f8temp0.s0 = f4temp0.s0 - f4temp0.s1;
f8temp0.s1 = f4temp0.s0 - f4temp0.s2;
f8temp0.s2 = f4temp0.s0 - f4temp0.s3;
f8temp0.s3 = f4temp0.s1 - f4temp0.s2;
f8temp0.s4 = f4temp0.s2 - f4temp0.s3;
f8temp0.s5 = f4temp0.s1 - f4temp0.s3;
f8temp0.s6 = f8temp0.s3 - f8temp0.s4;

// Calculate the cost function
for (unsigned short idx = 0; idx < LUT_0_1_TABLE_ROW_SIZE; ++idx)
{
// Calculate the lut1 components
f8temp1.s0 = lut1Local[idx].s0*tempOut.m_fval0;
f8temp1.s1 = lut1Local[idx].s1*tempOut.m_fval0;
f8temp1.s2 = lut1Local[idx].s2*tempOut.m_fval0;
f8temp1.s3 = f8temp1.s0 - f8temp1.s1;
f8temp1.s4 = f8temp1.s0 - f8temp1.s2;
f8temp1.s5 = f8temp1.s0 - f8temp1.s3;
f8temp1.s6 = f8temp1.s3 - f8temp1.s4;

f8temp0 -= f8temp1;

f8temp1 += (f8temp1 > (float8)100.0f ? (float8)-5.0f : (float8)0.0f);

// Square all elements of the vector
f8temp1 *= f8temp1;

// Sum all elements of the vector together
sumOfSquares = f8temp1.s0;
sumOfSquares += f8temp1.s1;
sumOfSquares += f8temp1.s2;
sumOfSquares += f8temp1.s3;
sumOfSquares += f8temp1.s4;
sumOfSquares += f8temp1.s5;
sumOfSquares += f8temp1.s6;

// If the new sum is < the old min, then this is the new min!  Update the idx and the minSum.
lut0RowIdx = (minSum < sumOfSquares ? lut0RowIdx : idx);
minSum = fmin(minSum, sumOfSquares);
}
lut0RowIdx = min(max(lut0RowIdx, (ushort)0), (ushort)(LUT_0_1_TABLE_ROW_SIZE - 1)); // clamp
tempOut.m_fin4 = lut0[lut0RowIdx];
output[gid] = tempOut;
}
```

Anyway I obfuscated my code a bit, so there may be some things which don't make a lot of sense, but I need to do all those operations.  One option I was looking at was have one kernel to do the conversion, then another for the cost function -> though the cpu would have to loop through and run the cost function kernel  for each of the intended objects.

Thanks for taking a look!