honghong

Performance in Brook+

Discussion created by honghong on Jan 6, 2009
Latest reply on Feb 5, 2009 by honghong

I have tried to write an sdot (dot product) program in Brook+, but the performance is very bad. My input vectors have dimension 1,000,000 and the performance is about 3 MFlops.

 

Here is my code:

 

// sdot_01: for each element of the output stream c, accumulates
// a[i] * b[i] over `width` consecutive gather indices, starting at the
// x index of that output element.
//
//   width  number of products accumulated per output element
//   a, b   gather arrays holding the two input vectors
//   c      output stream; every element receives its own accumulated sum
//
// NOTE(review): every element of c runs the full `width`-iteration loop, so
// the total work is (number of elements in c) * width, and only the element
// at index 0 sums a[0 .. width-1]. Presumably c is meant to hold a single
// element (or the sum should be a reduction) -- confirm against the host code.
kernel void sdot_01(float width, float a[], float b[], out float c<>)
{
 float res = 0.0f;

 // Start gathering at this output element's own x position.
 float index = indexof(c).x;

 // Float loop counter counting down from width to zero.
 float i0 = width;
 while(i0 > 0.0f)
 {
  res += a[index]*b[index];
  index += 1.0f;
  i0 = i0 - 1.0f;
 }

 c = res;
}

 

 

// sdot_02: same accumulation as a plain multiply-add loop, but each product
// is formed with the dot() intrinsic. For each element of the output stream
// c, sums dot(a[i], b[i]) over `width` consecutive gather indices, starting
// at the x index of that output element.
//
//   width  number of products accumulated per output element
//   a, b   gather arrays holding the two input vectors (scalar floats)
//   c      output stream; every element receives its own accumulated sum
//
// NOTE(review): a and b hold scalar floats, so dot() here is a single
// multiply per iteration. Every element of c runs the full loop, so total
// work is (number of elements in c) * width -- confirm the intended size of c.
kernel void sdot_02(float width, float a[], float b[], out float c<>)
{
 float res = 0.0f;

 // Start gathering at this output element's own x position.
 float index = indexof(c).x;

 // Float loop counter counting down from width to zero.
 float i0 = width;
 while(i0 > 0.0f)
 {
  res += dot(a[index], b[index]);
  index += 1.0f;
  i0 = i0 - 1.0f;
 }

 c = res;
}

 

 

// sdot_03: float4-packed variant. For each element of the output stream C,
// accumulates the 4-component dot product of A[i] and B[i] over `loopVar0`
// consecutive gather indices, starting at the x index of that output element.
//
//   loopVar0  number of float4 pairs accumulated per output element
//   A, B      gather arrays holding the input vectors packed as float4
//   C         output stream; every element receives its own accumulated sum
//
// NOTE(review): every element of C runs the full loop, so total work is
// (number of elements in C) * loopVar0 -- confirm the intended size of C.
kernel void sdot_03(float loopVar0, float4 A[],float4 B[],out float C<>)
{
 float res = 0.0f;

 // Start gathering at this output element's own x position.
 float index = indexof(C).x;

 // Float loop counter counting down from loopVar0 to zero.
 float i0 = loopVar0;
 while(i0 > 0.0f)
 {
  // Fetching values from A:
  float4 A1 = A[index];

  // Fetching values from B:
  float4 B1 = B[index];

  // dot() yields the scalar A1.x*B1.x + A1.y*B1.y + A1.z*B1.z + A1.w*B1.w,
  // which is what a dot product needs and matches the scalar type of res.
  res += dot(A1, B1);

  index += 1.0f;
  i0 = i0 - 1.0f;
 }

 C = res;
}


// sdot_04: eight-way variant of the float4-packed dot product. Each of the
// eight (aK, bK) gather-array pairs is accumulated independently into its own
// output stream cK. For each output element, the loop sums dot(aK[i], bK[i])
// over `loopVar0` consecutive gather indices, starting at the x index of the
// current element of c1.
//
//   loopVar0  number of float4 pairs accumulated per output element and pair
//   a1..a8    gather arrays holding eight parts of the first input, as float4
//   b1..b8    gather arrays holding eight parts of the second input, as float4
//   c1..c8    output streams; cK receives the accumulated sum for (aK, bK)
//
// NOTE(review): the eight partial results are not combined here; presumably
// the host adds c1..c8 to obtain the final dot product -- confirm against the
// calling code. Every output element runs the full loop, so total work scales
// with (number of output elements) * loopVar0.
kernel void sdot_04(float loopVar0, float4 a1[], float4 a2[], float4 a3[], float4 a4[], float4 a5[], float4 a6[], float4 a7[], float4 a8[],
      float4 b1[], float4 b2[], float4 b3[], float4 b4[], float4 b5[], float4 b6[], float4 b7[], float4 b8[],
      out float c1<>, out float c2<>, out float c3<>, out float c4<>, out float c5<>, out float c6<>, out float c7<>, out float c8<>)
{
 // Start gathering at this output element's own x position.
 float index = indexof(c1).x;

 // Declaring and initializing accumulators
 float accumulator1 = 0.0f;
 float accumulator2 = 0.0f;
 float accumulator3 = 0.0f;
 float accumulator4 = 0.0f;
 float accumulator5 = 0.0f;
 float accumulator6 = 0.0f;
 float accumulator7 = 0.0f;
 float accumulator8 = 0.0f;

 // Float loop counter counting down from loopVar0 to zero.
 float i0 = loopVar0;
 while(i0 > 0.0f)
 {
  // Fetching values from a:
  float4 A1 = a1[index];
  float4 A2 = a2[index];
  float4 A3 = a3[index];
  float4 A4 = a4[index];
  float4 A5 = a5[index];
  float4 A6 = a6[index];
  float4 A7 = a7[index];
  float4 A8 = a8[index];

  // Fetching values from b:
  float4 B1 = b1[index];
  float4 B2 = b2[index];
  float4 B3 = b3[index];
  float4 B4 = b4[index];
  float4 B5 = b5[index];
  float4 B6 = b6[index];
  float4 B7 = b7[index];
  float4 B8 = b8[index];

  // dot() yields the scalar sum of the four component-wise products,
  // matching the scalar type of each accumulator.
  accumulator1 += dot(A1, B1);
  accumulator2 += dot(A2, B2);
  accumulator3 += dot(A3, B3);
  accumulator4 += dot(A4, B4);
  accumulator5 += dot(A5, B5);
  accumulator6 += dot(A6, B6);
  accumulator7 += dot(A7, B7);
  accumulator8 += dot(A8, B8);

  index += 1.0f;
  i0 = i0 - 1.0f;
 }

 c1 = accumulator1;
 c2 = accumulator2;
 c3 = accumulator3;
 c4 = accumulator4;
 c5 = accumulator5;
 c6 = accumulator6;
 c7 = accumulator7;
 c8 = accumulator8;
}

 

Do you know what is happening? Or have I made an error? My PC has a 2x2.4 GHz CPU and the graphics card is an ATI 3870.

 

Do you have any documentation on Brook+ programming semantics?

 

Thank you for your answer.

Outcomes