8 Replies Latest reply on Jul 29, 2008 10:39 PM by macdonc

    Compare instruction and breaking from loop issue

    macdonc

      I have this simple add loop which transforms a list using a +1 array index.

      For an input list of 1,2,3,4,5,6,7,....

      The output should be 2,3,4,5,6,7,1,....

      If I use the ilt op it works correctly; however, if I use the ieq op it does not.

      I get 2,3,4,5,6,7,7,......

      What am I doing wrong?

      Thanks,

      Craig

       

      "il_ps_2_0\n"

      " dcl_literal l0, 0,0,0,0\n"
      " dcl_literal l1, 1,0,0,0\n"
      " dcl_literal l2, 6,0,0,0\n"

      "mov r1, l0\n"
      "whileloop\n"

      "ilt r3, r1, l2\n"
      "if_logicalz r3\n"
      " mov r2, l0\n"
      "else\n"
      " mov r2, l1\n"
      "endif\n"

      "ieq r3, r2, l1\n"
      //"ilt r3, r1, l2\n"
      " break_logicalz r3\n"

      " mov r4, g[r1.x]\n"
      " iadd r5, r1, l1\n"
      " mov g[r1.x], g[r5.x]\n"
      " mov g[r5.x], r4\n"

      " iadd r1, r1, l1\n"

      "endloop\n"

      "ret_dyn\n"
      "end\n"

       

        • Compare instruction and breaking from loop issue
          MicahVillmow
          macdonc,
          The problem is coming from your break instruction.
          for example, use the case where the loop is in the 3rd iteration.
          r1 < l2, so r3 = true
          at the if statement, you are setting r2 to 0
          at the ieq, r2 != l1, so R3 == false(or zero)
          then you are saying, break if r3 is zero, so you break early
          Whereas if you had r2 < l1, then R3 would be true and would not break.

          Hope this helps.

          Also, there is no guarantee on the validity of your data in this example. You are swapping the locations of r1.x and r5.x, but this would only work if you were running a single thread on the GPU. Once multiple threads are running on the GPU, then there is no guarantee on which thread is updating that memory location at which time.
          • Compare instruction and breaking from loop issue
            MicahVillmow
            Ahh yeah, you are right. I've done some testing and it seems that at least for my simple test case using either ieq or ilt are equivalent.
            Whether I use ieq or ilt, the same values are being placed in the global buffer at (loopcount * 10 + offset).

            On another note, if you are only using a single thread, then there isn't much point in using a GPU, as a CPU's single-thread performance is many times better. The GPU is great at massively parallel computations.

            The kernel I used is as follows:
            "il_ps_2_0\n"

            " dcl_literal l0, 0,0,0,0\n"
            " dcl_literal l1, 1,0,0,0\n"
            " dcl_literal l2, 6,0,0,0\n"
            " dcl_literal l3, 10, 10, 10, 10\n"
            "mov r1, l0\n"
            "mov r10, l3\n"
            "whileloop\n"

            "ilt r3, r1, l2\n"
            "mov g[r10.x + 1], r3\n"
            "if_logicalz r3\n"
            " mov r2, l0\n"
            "mov g[r10.x + 2], r2\n"
            "else\n"
            " mov r2, l1\n"
            "mov g[r10.x + 3], r2\n"
            "endif\n"

            "ieq r3, r2, l1\n"
            //"ilt r3, r1, l2\n"
            "mov g[r10.x + 4], r3\n"
            " break_logicalz r3.x\n"
            "mov g[r10.x + 5], r3.0\n"
            "mov g[r10.x + 6], l3\n"
            " mov r4, g[r1.x]\n"
            " iadd r5, r1, l1\n"
            " mov g[r1.x], g[r5.x]\n"
            " mov g[r5.x], r4\n"

            " iadd r1, r1, l1\n"
            "mov g[r10.x + 9], r1\n"
            "iadd r10, r10, l3\n"
            "endloop\n"

            "ret_dyn\n"
            "end\n"
              • Compare instruction and breaking from loop issue
                macdonc

                If I comment out some of your mov g[r10] statements,
                I get different results.

                orig code
                2,3,4,5,6,7,7

                with new comments
                2,3,4,5,6,7,1 (ilt) correct
                2,3,4,5,6,7,7 (ieq) incorrect

                Yeah, you are right about the single-thread issue. I am just learning
                the ISA.

                Thanks for your help.

                Craig

                "il_ps_2_0\n"

                " dcl_literal l0, 0,0,0,0\n"
                " dcl_literal l1, 1,0,0,0\n"
                " dcl_literal l2, 6,0,0,0\n"
                " dcl_literal l3, 10, 10, 10, 10\n"
                "mov r1, l0\n"
                "mov r10, l3\n"
                "whileloop\n"

                "ilt r3, r1, l2\n"
                //"mov g[r10.x + 1], r3\n"
                "if_logicalz r3\n"
                " mov r2, l0\n"
                //"mov g[r10.x + 2], r2\n"
                "else\n"
                " mov r2, l1\n"
                //"mov g[r10.x + 3], r2\n"
                "endif\n"

                "ieq r3, r2, l1\n"
                "ilt r3, r1, l2\n"
                //"mov g[r10.x + 4], r3\n"
                " break_logicalz r3.x\n"
                "mov g[r10.x + 5], r3.0\n"
                "mov g[r10.x + 6], l3\n"
                " mov r4, g[r1.x]\n"
                " iadd r5, r1, l1\n"
                " mov g[r1.x], g[r5.x]\n"
                " mov g[r5.x], r4\n"

                " iadd r1, r1, l1\n"
                "mov g[r10.x + 9], r1\n"
                "iadd r10, r10, l3\n"
                "endloop\n"

                "ret_dyn\n"
                "end\n"



                  • Compare instruction and breaking from loop issue
                    macdonc

                    If I copy the global buffer to a temp array, do the operations on the temp array,
                    and then copy the temp array back to global, I get the correct results for both ilt and ieq.

                    Thanks
                    Craig


                    "il_ps_2_0\n"
                    "dcl_indexed_temp_array x0[48]\n"

                    " dcl_literal l0, 0,0,0,0\n"
                    " dcl_literal l1, 1,0,0,0\n"
                    " dcl_literal l2, 6,0,0,0\n"
                    " dcl_literal l3, 7,0,0,0\n"

                    "mov r1, l0\n"
                    "whileloop\n"
                    "ilt r3, r1, l3\n"
                    " break_logicalz r3.x\n"
                    "mov x0[r1.x], g[r1.x]\n"
                    " iadd r1, r1, l1\n"
                    "endloop\n"

                    "mov r1, l0\n"
                    "mov r10, l3\n"
                    "whileloop\n"

                    "ilt r3, r1, l2\n"
                    "if_logicalz r3\n"
                    " mov r2, l0\n"
                    "else\n"
                    " mov r2, l1\n"
                    "endif\n"

                    //"ieq r3, r2, l1\n"
                    "ilt r3, r1, l2\n"
                    " break_logicalz r3.x\n"

                    " mov r4, x0[r1.x]\n"
                    " iadd r5, r1, l1\n"
                    " mov x0[r1.x], x0[r5.x]\n"
                    " mov x0[r5.x], r4\n"

                    " iadd r1, r1, l1\n"

                    "endloop\n"

                    "mov r1, l0\n"
                    "whileloop\n"
                    "ilt r3, r1, l3\n"
                    " break_logicalz r3.x\n"
                    "mov g[r1.x], x0[r1.x]\n"
                    " iadd r1, r1, l1\n"
                    "endloop\n"

                    "ret_dyn\n"
                    "end\n"



                • Compare instruction and breaking from loop issue
                  MicahVillmow
                  macdonc,
                  The reason for your problem is one that is an architectural misunderstanding between how a CPU thread works and how a GPU thread works. Although you are telling CAL that you only want to run one thread, it is not possible to run a single thread in hardware. The reason for this is that in hardware the smallest size on the main graphic cards is 64 threads that can be run, i.e. a Hardware thread is equal to 64 software threads running in parallel executing the same instruction in lockstep. Since the address locations are static, there is a conflict between the reads and writes to the global buffer. Not sure what graphics card you are running, but on the HD3XXX series, there is no guarantee on validity of global buffer data accesses if more than a single software thread writes to the same location that another thread reads since there is no synchronization. This is why you see correct results when using the temp array and not the global buffer. The temp array is private to each software thread, whereas the global buffer is not.

                    • Compare instruction and breaking from loop issue
                      macdonc

                      Thanks for the information. There is a lot to learn with gpu programming.

                      This was only an example of the issue I am seeing.
                      The full example comes from the output of the hobby C-like GPU compiler I am making.

                      The compiler "<" construct uses the if conditional as in the example il code.
                      In this case the "<" construct does not work correctly. In other cases it is fine.

                      If I change it to use "ilt" instead of "ieq", the compiled code works correctly.
                      " break_logicalz r15\n" -> ***** if changed to r14 "ilt" then works correctly ****
                      See below.

                      Again thanks for your help.

                      Craig

                      // Source for the compiler test case: fills a 12-element array with
                      // 1..12, runs one pass of adjacent swaps over the first seven
                      // elements, and writes the element selected by pos to the output rv.
                      test(int rv<> out, int pos index)
                      {
                          int i;
                          int j;
                          int t;
                          int t1;
                          int t2;
                          int e[12];

                          // Initialise e[i] = i + 1. The element index is written out
                          // explicitly; the compiled IL for this loop stores through
                          // x0[r0.x], i.e. one element per iteration.
                          i = 0;
                          while(i<12) {
                              t1 = i + 1;
                              e[i] = t1;
                              i = i + 1;
                          }

                          // Outer loop runs once (i < 1); the inner loop swaps e[j] and
                          // e[j + 1] for j = 0..5, reading the saved value from e[i + j].
                          i = 0;
                          while(i < 1) {
                              j = 0;
                              while(j < 6) {
                                  t1 = j + 1;
                                  t2 = i + j;
                                  t = e[t2];
                                  e[j] = e[t1];
                                  e[t1] = t;

                                  j = j + 1;
                              }

                              i = i + 1;
                          }

                          rv = e[pos];
                      }


                      "il_ps_2_0\n"
                      "dcl_output o0\n"
                      "dcl_input_position_interp(linear_noperspective) v0\n"
                      "ftoi v0, v0\n"
                      "dcl_indexed_temp_array x0[12]\n"
                      " dcl_literal l0, 0,0,0,0\n"
                      " mov r0, l0\n"
                      "whileloop\n"
                      " dcl_literal l1, 0,0,0,0\n"
                      " mov r5, l1\n"
                      " dcl_literal l2, 12,0,0,0\n"
                      " ilt r8, r0, l2\n"
                      " if_logicalnz r8\n"
                      " dcl_literal l3, 1,0,0,0\n"
                      " mov r5, l3\n"
                      "endif\n"
                      " dcl_literal l4, 1,0,0,0\n"
                      " ieq r9, r5, l4\n"
                      " break_logicalz r9\n"
                      " dcl_literal l5, 1,0,0,0\n"
                      " iadd r10, r0, l5\n"
                      " mov r3, r10\n"
                      " mov x0[r0.x], r3\n"
                      " dcl_literal l6, 1,0,0,0\n"
                      " iadd r11, r0, l6\n"
                      " mov r0, r11\n"
                      "endloop\n"
                      " dcl_literal l7, 0,0,0,0\n"
                      " mov r0, l7\n"
                      "whileloop\n"
                      " dcl_literal l8, 0,0,0,0\n"
                      " mov r6, l8\n"
                      " dcl_literal l9, 1,0,0,0\n"
                      " ilt r12, r0, l9\n"
                      " if_logicalnz r12\n"
                      " dcl_literal l10, 1,0,0,0\n"
                      " mov r6, l10\n"
                      "endif\n"
                      " dcl_literal l11, 1,0,0,0\n"
                      " ieq r13, r6, l11\n"
                      " break_logicalz r13\n"
                      " dcl_literal l12, 0,0,0,0\n"
                      " mov r1, l12\n"
                      "whileloop\n"
                      " dcl_literal l13, 0,0,0,0\n"
                      " mov r7, l13\n"
                      " dcl_literal l14, 6,0,0,0\n"
                      " ilt r14, r1, l14\n"
                      " if_logicalnz r14\n"
                      " dcl_literal l15, 1,0,0,0\n"
                      " mov r7, l15\n"
                      "endif\n"
                      " dcl_literal l16, 1,0,0,0\n"
                      " ieq r15, r7, l16\n"
                      " break_logicalz r15\n" -> ***** if changed to r14 "ilt" then works correctly ****
                      " dcl_literal l17, 1,0,0,0\n"
                      " iadd r16, r1, l17\n"
                      " mov r3, r16\n"
                      " iadd r17, r0, r1\n"
                      " mov r4, r17\n"
                      " mov r2, x0[r4.x]\n"
                      " mov x0[r1.x], x0[r3.x]\n"
                      " mov x0[r3.x], r2\n"
                      " dcl_literal l18, 1,0,0,0\n"
                      " iadd r18, r1, l18\n"
                      " mov r1, r18\n"
                      "endloop\n"
                      " dcl_literal l19, 1,0,0,0\n"
                      " iadd r19, r0, l19\n"
                      " mov r0, r19\n"
                      "endloop\n"
                      " mov o0, x0[v0.x]\n"
                      "ret_dyn\n"
                      "end\n"