9 Replies Latest reply on Feb 5, 2009 1:15 PM by honghong

    Performance in Brook+

    honghong

      I have tried to write an sdot program with Brook+, but the performance is very bad. My input vectors have dimension 1 000 000 and the performance is about 3 MFlops.

       

      Here is my code:

       

      // sdot_01: serial dot product.
      //   width - number of element pairs to multiply and accumulate
      //   a, b  - gather arrays holding the two input vectors
      //   c     - output stream; each element receives one accumulated sum
      // Every output element runs the whole loop by itself, starting at its
      // own x position in c, so the work is not shared between elements.
      kernel void sdot_01(float width, float a[], float b[], out float c<>)
      {
          float res = 0.0f;
          float vPos = indexof(c).x;   // x position of this output element
          float index = vPos.x;        // first gather index
          float step = 1.0f;

          // The loop counter is a float that counts down from width to 0.
          float i0 = width;
          while (i0 > 0)
          {
              res += a[index.x] * b[index.x];
              index += step;
              i0 = i0 - 1.0f;
          }

          c = res;
      }

       

       

      // sdot_02: serial dot product, using the dot() intrinsic per element pair.
      //   width - number of element pairs to multiply and accumulate
      //   a, b  - gather arrays holding the two input vectors
      //   c     - output stream; each element receives one accumulated sum
      // Every output element runs the whole loop by itself, starting at its
      // own x position in c.
      kernel void sdot_02(float width, float a[], float b[], out float c<>)
      {
          float res = 0.0f;
          float vPos = indexof(c).x;   // x position of this output element
          float index = vPos.x;        // first gather index
          float step = 1.0f;

          // The loop counter is a float that counts down from width to 0.
          float i0 = width;
          while (i0 > 0)
          {
              // NOTE(review): both operands are scalar floats, so dot() here
              // presumably reduces to a plain multiply - confirm.
              res += dot(a[index.x], b[index.x]);
              index += step;
              i0 = i0 - 1.0f;
          }

          c = res;
      }

       

       

      // sdot_03: serial dot product over float4-packed vectors.
      //   loopVar0 - number of float4 elements to process
      //   A, B     - gather arrays holding the two input vectors, 4 floats each
      //   C        - output stream; each element receives one accumulated sum
      // Every output element runs the whole loop by itself, starting at its
      // own x position in C.
      kernel void sdot_03(float loopVar0, float4 A[], float4 B[], out float C<>)
      {
          float res = 0.0f;
          float vPos = indexof(C).x;   // x position of this output element
          float index = vPos.x;        // first gather index
          float step = 1.0f;

          float i0 = loopVar0;
          while (i0 > 0.0f)
          {
              // Fetching values from A:
              float4 A1 = A[index.x];

              // Fetching values from B:
              float4 B1 = B[index.x];

              // Sum of the four component products. The previous expression
              // (A1.xxxx * B1.xyzw + A1.yyyy * B1.xyzw + ...) produced a
              // float4 equal to (A1.x + A1.y + A1.z + A1.w) * B1, which is
              // not a dot product and does not match the scalar accumulator.
              res += A1.x * B1.x + A1.y * B1.y + A1.z * B1.z + A1.w * B1.w;

              index += step;
              i0 = i0 - 1.0f;
          }

          C = res;
      }


      // sdot_04: eight independent serial dot products over float4-packed vectors.
      //   loopVar0 - number of float4 elements to process per input pair
      //   a1..a8   - gather arrays for the left-hand vectors
      //   b1..b8   - gather arrays for the right-hand vectors
      //   c1..c8   - output streams; cK receives the accumulated sum of aK . bK
      // Every output element runs the whole loop by itself, starting at its
      // own x position in c1.
      kernel void sdot_04(float loopVar0, float4 a1[], float4 a2[], float4 a3[], float4 a4[], float4 a5[], float4 a6[], float4 a7[], float4 a8[],
            float4 b1[], float4 b2[], float4 b3[], float4 b4[], float4 b5[], float4 b6[], float4 b7[], float4 b8[],
            out float c1<>, out float c2<>, out float c3<>, out float c4<>, out float c5<>, out float c6<>, out float c7<>, out float c8<>)
      {
          float vPos = indexof(c1).x;   // x position of this output element
          float index = vPos.x;         // first gather index
          float step = 1.0f;

          // Declaring and initializing accumulators
          float accumulator1 = 0.0f;
          float accumulator2 = 0.0f;
          float accumulator3 = 0.0f;
          float accumulator4 = 0.0f;
          float accumulator5 = 0.0f;
          float accumulator6 = 0.0f;
          float accumulator7 = 0.0f;
          float accumulator8 = 0.0f;

          float i0 = loopVar0;
          while (i0 > 0.0f)
          {
              // Fetching values from a:
              float4 A1 = a1[index.x];
              float4 A2 = a2[index.x];
              float4 A3 = a3[index.x];
              float4 A4 = a4[index.x];
              float4 A5 = a5[index.x];
              float4 A6 = a6[index.x];
              float4 A7 = a7[index.x];
              float4 A8 = a8[index.x];

              // Fetching values from b:
              float4 B1 = b1[index.x];
              float4 B2 = b2[index.x];
              float4 B3 = b3[index.x];
              float4 B4 = b4[index.x];
              float4 B5 = b5[index.x];
              float4 B6 = b6[index.x];
              float4 B7 = b7[index.x];
              float4 B8 = b8[index.x];

              // Sum of the four component products for each pair. The previous
              // expression (A.xxxx * B.xyzw + A.yyyy * B.xyzw + ...) produced a
              // float4 equal to (A.x + A.y + A.z + A.w) * B, which is not a
              // dot product and does not match the scalar accumulators.
              accumulator1 += A1.x * B1.x + A1.y * B1.y + A1.z * B1.z + A1.w * B1.w;
              accumulator2 += A2.x * B2.x + A2.y * B2.y + A2.z * B2.z + A2.w * B2.w;
              accumulator3 += A3.x * B3.x + A3.y * B3.y + A3.z * B3.z + A3.w * B3.w;
              accumulator4 += A4.x * B4.x + A4.y * B4.y + A4.z * B4.z + A4.w * B4.w;
              accumulator5 += A5.x * B5.x + A5.y * B5.y + A5.z * B5.z + A5.w * B5.w;
              accumulator6 += A6.x * B6.x + A6.y * B6.y + A6.z * B6.z + A6.w * B6.w;
              accumulator7 += A7.x * B7.x + A7.y * B7.y + A7.z * B7.z + A7.w * B7.w;
              accumulator8 += A8.x * B8.x + A8.y * B8.y + A8.z * B8.z + A8.w * B8.w;

              index += step;
              i0 = i0 - 1.0f;
          }

          c1 = accumulator1;
          c2 = accumulator2;
          c3 = accumulator3;
          c4 = accumulator4;
          c5 = accumulator5;
          c6 = accumulator6;
          c7 = accumulator7;
          c8 = accumulator8;
      }

       

      Do you know what is happening? Or have I made an error? My PC has 2 x 2.4 GHz and the graphics card is an ATI 3870.

       

      Do you have any documents on Brook+ programming semantics?

       

      thank you for your answer.

        • Performance in Brook+
          BarsMonster

          Probably CPU<>GPU bandwidth is a bottleneck.

          Could you show your kernel execution code?

            • Performance in Brook+
              honghong

              Hello BarsMonster!

              thank you for your answer!

              but how can i show my kernel execution code?

              regards

                • Performance in Brook+
                  BarsMonster

                  Show code which executes sdot_04

                    • Performance in Brook+
                      honghong

                      I have tried running the kernel for 100 iterations. With input vectors of dimension 1 000 000 the performance is about 70 MFlops. That is better, but the performance is still very bad. Can anybody help me? Thanks!!

                      // sdot_02: serial dot product, using the dot() intrinsic per element pair.
                      //   width - number of element pairs to multiply and accumulate
                      //   a, b  - gather arrays holding the two input vectors
                      //   c     - output stream; each element receives one accumulated sum
                      // Every output element runs the whole loop by itself, starting at its
                      // own x position in c.
                      kernel void sdot_02(float width, float a[], float b[], out float c<>)
                      {
                          float res = 0.0f;
                          float vPos = indexof(c).x;   // x position of this output element
                          float index = vPos.x;        // first gather index
                          float step = 1.0f;

                          // The loop counter is a float that counts down from width to 0.
                          float i0 = width;
                          while (i0 > 0)
                          {
                              // NOTE(review): both operands are scalar floats, so dot() here
                              // presumably reduces to a plain multiply - confirm.
                              res += dot(a[index.x], b[index.x]);
                              index += step;
                              i0 = i0 - 1.0f;
                          }

                          c = res;
                      }

                        • Performance in Brook+
                          rick.weber

                          honghong,

                          Your performance issues are probably related to two things:

                          First, as BarsMonster pointed out, the time it takes to transfer your input vectors and to fetch the output is probably much larger than the actual computation. I'll probably get into trouble for saying this, but for a problem to see a speedup on a GPU, it needs to be significantly complex in order to amortize the cost of data transfer. A dot product is just not complex enough (you do 1 operation per value that you pass to the GPU). I would guess that your problem needs to be at least O(N²) to overcome the cost of data transfer.

                          Secondly, as you have implemented it, there is absolutely no parallelism. GPUs achieve speedups by exploiting parallelism in code. I would write a kernel that multiplies c=a*b and then write another kernel that does a reduction. Brook+ inherently supports reduction kernels, which you can find examples of on this forum, and these will allow you to still extract some parallelism from the addition.

                            • Performance in Brook+
                              gaurav.garg

                              Could you post your runtime side code. e.g. What are your stream dimensions?

                                • Performance in Brook+
                                  honghong

                                  Hi rick.weber and gaurav.garg!

                                  thank  you for your answer! 

                                   

                                  I want to do a dot product operation. so the stream must be 1D.

                                  // Host-side excerpt that launches sdot_02 and reports timing.
                                  // NOTE(review): this is a fragment; the enclosing function and the
                                  // declarations of i, j, Length, Width, cmd, d1Input, d1Output, flops
                                  // and gpu_time are not shown here.
                                  float d1ArrayA< Length >;
                                     float d1ArrayB< Length >;
                                     // The output stream is declared with a single element.
                                     float d1ArrayC< 1 >;   

                                     // Start(1)/Stop(1) bracket the stream reads, every kernel launch and
                                     // the stream write, so (presumably) timer 1 measures the transfers
                                     // as well as the kernel work - confirm against the timer helpers.
                                     Start(1);

                                     // Read the data into the streams from the input arrays
                                     streamRead(d1ArrayA, d1Input[0]);
                                     streamRead(d1ArrayB, d1Input[1]);

                                     // Launch the kernel cmd.Iterations times with the same arguments
                                     for (i = 0; i < cmd.Iterations; ++i)
                                     {
                                      sdot_02((float)Length, d1ArrayA, d1ArrayB, d1ArrayC);
                                     }
                                     
                                     // Write the data from the output stream into the output array
                                     streamWrite(d1ArrayC, d1Output);

                                     Stop(1);
                                    }


                                    printf("\nSDOT_02 in GPU:%f  \n",d1Output[0]);

                                    if(cmd.Timing)
                                    {
                                     // NOTE(review): flops covers a single pass (2 * Width) although the
                                     // timed loop above ran cmd.Iterations passes, and it uses Width
                                     // while the kernel is given Length - confirm both are intended.
                                     flops = 2.0f*Width;
                                     gpu_time = GetElapsedTime(1);
                                     printf("Leistung von GPU...\n");
                                     // j is printed as the dimension; its definition is outside this excerpt.
                                     printf("\nMDIM: %d,   Time: %lf seconds,  %lf MFlops\n\n",j,gpu_time,1.0e-6*flops/gpu_time);
                                    }

                                   

                                   

                                   

                                    • Performance in Brook+
                                      gaurav.garg

                                      So, you are using a single-element output stream, which would mean you are actually using a single stream core (out of 800). As mentioned by rick.weber, you should split your algorithm into two kernels (one kernel for the dot product and another for the reduction). Still, I don't see much performance improvement with it, as GPU computation depends on a high ALU/memory ratio in your algorithm, and you seem to be doing a single ALU op with two memory fetches.

                                        • Performance in Brook+
                                          honghong

                                          hi,

                                          For the sdot product I have changed the code and the performance is better. I have tried vector dimensions from 1e4 to 1e7 and got 500 MFlops to 1100 MFlops. Can anybody suggest changes for a bigger performance improvement?

                                          // sdot_01: element-wise product of two streams.
                                          //   a, b - input streams
                                          //   c    - output stream; each element receives a * b for its position
                                          kernel void sdot_01(float a<>, float b<>, out float c<>)
                                          {
                                              c = a * b;
                                          }

                                          // sum: reduction kernel that adds every element of a into res.
                                          //   a   - input stream to be reduced
                                          //   res - reduction result, accumulated with +=
                                          reduce void sum(float a<>, reduce float res<>)
                                          {
                                              res += a;
                                          }