12 Replies Latest reply on Feb 9, 2009 6:31 PM by ryta1203

    Another Performance Question

    ryta1203
      I have two very simple kernels:

      // Per element: reduces each float4 input to the sum of its four components
      // and writes the two sums to two separate scalar output streams (c and d).
      kernel void step1(float4 a<>, float4 b<>, out float c<>, out float d<>)
      {
      c = a.x + a.y + a.z + a.w;
      d = b.x + b.y + b.z + b.w;
      }

      // Computes the same two component sums as step1, but packs them into the
      // x and y components of a single float4 output stream.
      // e.z and e.w are never assigned in this kernel.
      kernel void step2(float4 a<>, float4 b<>, out float4 e<>)
      {
      e.x = a.x + a.y + a.z + a.w;
      e.y = b.x + b.y + b.z + b.w;
      }

      The sizes of all the streams are the same, let's say 2048. I iterate over the kernels 2048 times (just to get longer timing results for the GPU).

      My question is this:

      Why does the first kernel run significantly faster than the second kernel?

      Looking at the KSA, the GPR count is lower for the 2nd kernel, and the ALU:Fetch ratio is 1.25 for the 2nd kernel and 2.5 for the 1st kernel. Since the GPR count is higher for the 1st kernel, there will be fewer wavefronts in the run queue. The KSA says the throughput should be higher for the 2nd kernel, along with threads/clock (which is another reason the KSA "measurables" don't say much about performance).


      My only guess would be that the higher ALU:Fetch ratio is allowing latency hiding across wavefronts in the GPU, is that an accurate statement?
        • Another Performance Question
          MicahVillmow
          Ryta,
          That could be the case, but you might also be having issues with it doing two writes to e instead of one. Try copying the data to a float4 before writing to e and see if that improves performance. KSA is correct when it states that the second one should have higher throughput, but there might be other issues causing it to go slower. I would actually expect the step2 kernel to run close to full speed and step1 to run at 1/2-3/4 of step2, because it has two output writes compared to one.
          • Another Performance Question
            MicahVillmow
            Can you post your IL kernels here for those two kernels so I can see what is really going on?
              • Another Performance Question
                ryta1203
                il_ps_2_0
                ; l0 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l0, 0x00000000, 0x00000000, 0x00000000, 0x00000000
                ; l1 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l1, 0x00000001, 0x00000001, 0x00000001, 0x00000001
                ; l2 = (-1.#QNAN0f -1.#QNAN0f -1.#QNAN0f -1.#QNAN0f)
                dcl_literal l2, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
                ; l3 = (1.#QNAN0f 1.#QNAN0f 1.#QNAN0f 1.#QNAN0f)
                dcl_literal l3, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
                ; l4 = (1.#INF00f 1.#INF00f 1.#INF00f 1.#INF00f)
                dcl_literal l4, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000
                ; l5 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l5, 0x80000000, 0x80000000, 0x80000000, 0x80000000
                ; l6 = (0.301030f 0.301030f 0.301030f 0.301030f)
                dcl_literal l6, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B
                ; l7 = (0.693147f 0.693147f 0.693147f 0.693147f)
                dcl_literal l7, 0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218
                ; l8 = (3.141593f 3.141593f 3.141593f 3.141593f)
                dcl_literal l8, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
                ; l9 = (1.570796f 1.570796f 1.570796f 1.570796f)
                dcl_literal l9, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB
                ; l10 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l10, 0x00000003, 0x00000003, 0x00000003, 0x00000003
                ; l11 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l11, 0x00000002, 0x00000002, 0x00000002, 0x00000002
                ; Interface: two outputs (o0, o1), two 2D resources (ids 0 and 1) with four
                ; float channels each, and the pixel position v0.xy as the only input.
                dcl_output_generic o0
                dcl_output_generic o1
                dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
                dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
                dcl_input_position_interp(linear_noperspective) v0.xy__
                ; Main body: keep the position in r281.xy, run func 38 (which leaves its
                ; results in r279/r280), then func 0 copies those registers to o0/o1.
                mov r281.xy__, v0
                call 38
                call 0
                endmain
                ; func 0: output stage -- copies r279 to o0 and r280 to o1.
                func 0
                mov o0, r279
                mov o1, r280
                ret
                ; func 11: input fetch helper.
                ;   in:  r62.x  = resource selector, integer-compared against l0.x and l1.x
                ;                 (declared above as bit patterns 0x00000000 and 0x00000001)
                ;        r63.xy = sample coordinates
                ;   out: r61    = copy of r64 after the conditional fetches
                ; r64 is written only when the selector equals one of the two literals.
                func 11
                ieq r0.x___, r62.x000, l0.x000
                if_logicalnz r0.x000
                sample_resource(0)_sampler(0) r64, r63.xy00
                endif
                ieq r0.x___, r62.x000, l1.x000
                if_logicalnz r0.x000
                sample_resource(1)_sampler(0) r64, r63.xy00
                endif
                mov r61, r64
                ret_dyn
                ret
                ; func 37: kernel arithmetic.
                ;   in:  r269, r270 (set by func 38 from the two fetched inputs)
                ;   out: r271.x = r269.x + r269.y + r269.z + r269.w
                ;        r272.x = r270.x + r270.y + r270.z + r270.w
                ; Each mov copies one source component into the x lane of a temp
                ; (r294..r299) so that every add operates on x lanes only.
                func 37
                mov r294.x___, r269.y000
                add r273.x___, r269.x000, r294.x000
                mov r295.x___, r269.z000
                add r274.x___, r273.x000, r295.x000
                mov r296.x___, r269.w000
                add r275.x___, r274.x000, r296.x000
                mov r271.x___, r275.x000
                mov r297.x___, r270.y000
                add r276.x___, r270.x000, r297.x000
                mov r298.x___, r270.z000
                add r277.x___, r276.x000, r298.x000
                mov r299.x___, r270.w000
                add r278.x___, r277.x000, r299.x000
                mov r272.x___, r278.x000
                ret
                ; func 38: kernel wrapper -- fetches both inputs at the position held in
                ; r281.xy, runs func 37, and packs each scalar result into its own
                ; four-component result register (r279 and r280).
                func 38
                ; Fetch through func 11 with selector l0.x (resource 0); result lands in r284.
                mov r288.xy__, r281.xy00
                mov r62.x___, l0.x000
                mov r63.xy__, r288.xy00
                call 11
                mov r300, r61
                mov r284, r300
                ; Fetch through func 11 with selector l1.x (resource 1); result lands in r285.
                mov r289.xy__, r281.xy00
                mov r62.x___, l1.x000
                mov r63.xy__, r289.xy00
                call 11
                mov r301, r61
                mov r285, r301
                ; Hand the two fetched values to func 37 in r269/r270.
                mov r269, r284
                mov r270, r285
                call 37
                mov r286.x___, r271.x000
                mov r287.x___, r272.x000
                ; First result: x = first sum; y, z, w are filled from l0 (all-zero bits).
                mov r290.x___, r286.x000
                mov r290._y__, l0.0x00
                mov r290.__z_, l0.00x0
                mov r290.___w, l0.000x
                mov r282, r290
                ; Second result: x = second sum; y, z, w are filled from l0 (all-zero bits).
                mov r291.x___, r287.x000
                mov r291._y__, l0.0x00
                mov r291.__z_, l0.00x0
                mov r291.___w, l0.000x
                mov r283, r291
                ; r279/r280 are the registers func 0 copies to o0/o1.
                mov r279, r282
                mov r280, r283
                ret_dyn
                ret
                end


                il_ps_2_0
                ; l0 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l0, 0x00000000, 0x00000000, 0x00000000, 0x00000000
                ; l1 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l1, 0x00000001, 0x00000001, 0x00000001, 0x00000001
                ; l2 = (-1.#QNAN0f -1.#QNAN0f -1.#QNAN0f -1.#QNAN0f)
                dcl_literal l2, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
                ; l3 = (1.#QNAN0f 1.#QNAN0f 1.#QNAN0f 1.#QNAN0f)
                dcl_literal l3, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
                ; l4 = (1.#INF00f 1.#INF00f 1.#INF00f 1.#INF00f)
                dcl_literal l4, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000
                ; l5 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l5, 0x80000000, 0x80000000, 0x80000000, 0x80000000
                ; l6 = (0.301030f 0.301030f 0.301030f 0.301030f)
                dcl_literal l6, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B
                ; l7 = (0.693147f 0.693147f 0.693147f 0.693147f)
                dcl_literal l7, 0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218
                ; l8 = (3.141593f 3.141593f 3.141593f 3.141593f)
                dcl_literal l8, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
                ; l9 = (1.570796f 1.570796f 1.570796f 1.570796f)
                dcl_literal l9, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB
                ; l10 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l10, 0x00000003, 0x00000003, 0x00000003, 0x00000003
                ; l11 = (0.000000f 0.000000f 0.000000f 0.000000f)
                dcl_literal l11, 0x00000002, 0x00000002, 0x00000002, 0x00000002
                ; Interface: a single output (o0), two 2D resources (ids 0 and 1) with four
                ; float channels each, and the pixel position v0.xy as the only input.
                dcl_output_generic o0
                dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
                dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
                dcl_input_position_interp(linear_noperspective) v0.xy__
                ; Main body: keep the position in r280.xy, run func 38 (which leaves its
                ; result in r279), then func 0 copies that register to o0.
                mov r280.xy__, v0
                call 38
                call 0
                endmain
                ; func 0: output stage -- copies r279 to the single output o0.
                func 0
                mov o0, r279
                ret
                ; func 11: input fetch helper.
                ;   in:  r62.x  = resource selector, integer-compared against l0.x and l1.x
                ;                 (declared above as bit patterns 0x00000000 and 0x00000001)
                ;        r63.xy = sample coordinates
                ;   out: r61    = copy of r64 after the conditional fetches
                ; r64 is written only when the selector equals one of the two literals.
                func 11
                ieq r0.x___, r62.x000, l0.x000
                if_logicalnz r0.x000
                sample_resource(0)_sampler(0) r64, r63.xy00
                endif
                ieq r0.x___, r62.x000, l1.x000
                if_logicalnz r0.x000
                sample_resource(1)_sampler(0) r64, r63.xy00
                endif
                mov r61, r64
                ret_dyn
                ret
                ; func 37: kernel arithmetic.
                ;   in:  r269, r270 (set by func 38 from the two fetched inputs)
                ;   out: r271 = copy of r272, where
                ;        r272.x = r269.x + r269.y + r269.z + r269.w
                ;        r272.y = r270.x + r270.y + r270.z + r270.w
                ; r272.z and r272.w are not written in this function before the copy.
                func 37
                mov r288.x___, r269.y000
                add r273.x___, r269.x000, r288.x000
                mov r289.x___, r269.z000
                add r274.x___, r273.x000, r289.x000
                mov r290.x___, r269.w000
                add r275.x___, r274.x000, r290.x000
                mov r272.x___, r275.x000
                mov r291.x___, r270.y000
                add r276.x___, r270.x000, r291.x000
                mov r292.x___, r270.z000
                add r277.x___, r276.x000, r292.x000
                mov r293.x___, r270.w000
                add r278.x___, r277.x000, r293.x000
                ; The second sum goes into the y lane of the same register as the first.
                mov r272._y__, r278.0x00
                mov r271, r272
                ret
                ; func 38: kernel wrapper -- fetches both inputs at the position held in
                ; r280.xy, runs func 37, and forwards its four-component result to r279.
                func 38
                ; Fetch through func 11 with selector l0.x (resource 0); result lands in r282.
                mov r285.xy__, r280.xy00
                mov r62.x___, l0.x000
                mov r63.xy__, r285.xy00
                call 11
                mov r294, r61
                mov r282, r294
                ; Fetch through func 11 with selector l1.x (resource 1); result lands in r283.
                mov r286.xy__, r280.xy00
                mov r62.x___, l1.x000
                mov r63.xy__, r286.xy00
                call 11
                mov r295, r61
                mov r283, r295
                ; Hand the two fetched values to func 37 in r269/r270.
                mov r269, r282
                mov r270, r283
                call 37
                ; Forward func 37's result unchanged; r279 is what func 0 copies to o0.
                mov r284, r271
                mov r281, r284
                mov r279, r281
                ret_dyn
                ret
                end


                respectively.
              • Another Performance Question
                MicahVillmow
                If you look at the resulting ISA, you can see that there is a BURSTCNT(1) on the first kernel. This allows the kernel to push the data out to the memory at a higher rate than a single write which the second kernel does. This might be the reason why.
                • Another Performance Question
                  MicahVillmow
                  The first kernel has two outputs, and the compiler is combining them into a single write of twice the length, but the second one only has a single write, so it is impossible to burst. Also, the extra ALU work might be helping by allowing more of the latency to be hidden.
                    • Another Performance Question
                      ryta1203
                      Micah,

                      Ok, so I now have these two kernels:

                      // Per element: reduces each float4 input to the sum of its four components
                      // and writes the two sums to two separate scalar output streams (c and d).
                      kernel void step1(float4 a<>, float4 b<>, out float c<>, out float d<>)
                      {
                      c = a.x + a.y + a.z + a.w;
                      d = b.x + b.y + b.z + b.w;
                      }

                      // Same two sums as step1, but each is written to the x component of its
                      // own float4 output stream; the y, z and w components of out1 and out2 are
                      // never assigned. The commented-out lines are a disabled variant that
                      // goes through a float4 temporary.
                      kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
                      {
                      //float4 temp;
                      out1.x = a.x + a.y + a.z + a.w;
                      out2.x = b.x + b.y + b.z + b.w;
                      //out1 = temp;
                      //out2 = temp;
                      }

                      I have tried step2 both ways, using temp as an intermediate and not (shown above). They both have a Burst Count of 1 according to the ISA in KSA. They both have exactly the same attributes according to the KSA: same ALU:Fetch, same GPR, same Avg Cycles, Threads/Clock, Throughput, CF, ALU, TEX, etc.

                      The 1st kernel runs almost twice as fast (6.5s) as the second kernel (11.5s). The ISAs have the same cycle count.

                      My only guess is that this is a memory issue, and I'm trying a little harder to understand the memory model in these GPUs.
                    • Another Performance Question
                      MicahVillmow
                      Ryta,
                      Writing out the float should be faster overall because less data is being written to memory. However, on a per-element basis, the second kernel writes out data faster. It writes 4x the amount of data but takes only about twice as long, which means it is writing out data roughly twice as fast. This is drastically oversimplifying it, but it explains why you are seeing the discrepancies.
                        • Another Performance Question
                          ryta1203
                          Micah,

                          Thanks again. This helps. In case anyone is wondering or reading this thread, I ran these two kernels:

                          // Per element: writes eight signed combinations of the components of a and
                          // b, one to each of eight scalar output streams. Odd-numbered outputs
                          // (f1, f3, f5, f7) use a; even-numbered outputs (f2, f4, f6, f8) use b.
                          kernel void step1(float4 a<>, float4 b<>, out float f1<>, out float f2<>, out float f3<>, out float f4<>,
                          out float f5<>, out float f6<>, out float f7<>, out float f8<>)
                          {
                          f1 = a.x + a.y + a.z + a.w;
                          f2 = b.x + b.y + b.z + b.w;
                          f3 = a.x - a.y - a.z - a.w;
                          f4 = b.x - b.y - b.z - b.w;
                          f5 = a.x + a.y - a.z - a.w;
                          f6 = b.x + b.y - b.z - b.w;
                          f7 = a.x - a.y + a.z + a.w;
                          f8 = b.x - b.y + b.z + b.w;
                          }

                          // Packed variant of the eight-output step1, as originally posted.
                          // NOTE(review): as written, the last four assignments overwrite out1.x..w
                          // and out2 is never assigned; the author's follow-up post in this thread
                          // says these were meant to target out2.
                          // NOTE(review): the second "out1.y" expression repeats the first one and
                          // differs from step1's f6 (b.x + b.y - b.z - b.w) -- possibly a
                          // copy-paste slip; confirm against the kernel that was actually timed.
                          kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
                          {
                          out1.x = a.x + a.y + a.z + a.w;
                          out1.y = b.x + b.y + b.z + b.w;
                          out1.z = a.x - a.y - a.z - a.w;
                          out1.w = b.x - b.y - b.z - b.w;
                          out1.x = a.x + a.y - a.z - a.w;
                          out1.y = b.x + b.y + b.z + b.w;
                          out1.z = a.x - a.y + a.z + a.w;
                          out1.w = b.x - b.y + b.z + b.w;
                          }

                          Both write out the same equations to EIGHT outputs. The 2nd kernel was much faster, by a ratio of 20s:11s (1st:2nd). I did not include any streamWrites in my timing.

                          EDIT: Although, I think some of this could be attributed to the number of wavefronts in the run queue, since the 1st kernel GPR is 17 and the second is 4. IS THAT AN ACCURATE STATEMENT?
                            • Another Performance Question
                              ryta1203
                              Sorry, the 2nd kernel should be:

                              // Packed variant of the eight-output step1: the same eight signed
                              // combinations of the components of a and b, written as the components
                              // of two float4 output streams instead of eight scalar streams.
                              //   out1.x..w correspond to step1's f1..f4
                              //   out2.x..w correspond to step1's f5..f8
                              kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
                              {
                              out1.x = a.x + a.y + a.z + a.w;
                              out1.y = b.x + b.y + b.z + b.w;
                              out1.z = a.x - a.y - a.z - a.w;
                              out1.w = b.x - b.y - b.z - b.w;
                              out2.x = a.x + a.y - a.z - a.w;
                              // Matches step1's f6; the posted version repeated out1.y's expression here.
                              out2.y = b.x + b.y - b.z - b.w;
                              out2.z = a.x - a.y + a.z + a.w;
                              out2.w = b.x - b.y + b.z + b.w;
                              }

                              In my previous post all of the assignments were to out1; this was a typo, my apologies.