3 Replies Latest reply on Dec 16, 2011 4:33 AM by gaddo

    help understanding the generated code

    gaddo

      Hello,

      i am experimenting with the OpenCL compiler with the CPU target.

      i am trying to get the compiler to generate a movq

      with this kernel , the code that i get 

      (I am using the wonderful AMD APP KernelAnalyzer 1.10.1149)

      __kernel void move(__global  uchar * outputv,
                         __global  uchar * inputv)
      {
          uint tid = get_global_id(0) * 8;
          uchar8 i = vload8(tid, inputv);
          vstore8(i, tid, outputv);
      }

          .def     ___OpenCL_move_kernel;
          .scl    2;
          .type    32;
          .endef
          .globl    ___OpenCL_move_kernel
          .align    16, 0x90
      ___OpenCL_move_kernel:                  # @__OpenCL_move_kernel
      # BB#0:                                 # %entry
          pushl    %ebx
          pushl    %esi
          subl    $8, %esp
          leal    8195(%esp), %eax
          movl    24(%esp), %ecx
          movl    20(%esp), %esi
          andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
          movl    -16(%eax), %eax
          shll    $6, %eax
          movb    7(%ecx,%eax), %dl
          movb    4(%ecx,%eax), %dh
          movb    3(%ecx,%eax), %bl
          movb    2(%ecx,%eax), %bh
          movb    %dl, 3(%esp)            # 1-byte Spill
          movb    6(%ecx,%eax), %dl
          movb    %dl, 2(%esp)            # 1-byte Spill
          movb    5(%ecx,%eax), %dl
          movb    %dl, 1(%esp)            # 1-byte Spill
          movb    (%ecx,%eax), %dl
          movb    1(%ecx,%eax), %cl
          movb    2(%esp), %ch            # 1-byte Reload
          movb    %dl, (%esi,%eax)
          movb    %cl, 1(%esi,%eax)
          movb    1(%esp), %cl            # 1-byte Reload
          movb    %bh, 2(%esi,%eax)
          movb    %bl, 3(%esi,%eax)
          movb    %dh, 4(%esi,%eax)
          movb    %cl, 5(%esi,%eax)
          movb    3(%esp), %cl            # 1-byte Reload
          movb    %ch, 6(%esi,%eax)
          movb    %cl, 7(%esi,%eax)
          addl    $8, %esp
          popl    %esi
          popl    %ebx
          ret

      so the vload is loading and storing 1byte at the time :(

      with this kernel

      __kernel void move2(__global  uchar8 * outputv,
                         __global  uchar8 * inputv)
      {
          uint tid = get_global_id(0) * 8;
          uchar8 i = inputv[tid];
          outputv[tid] = i;
      }

      the generated code is

          .def     ___OpenCL_move2_kernel;
          .scl    2;
          .type    32;
          .endef
          .globl    ___OpenCL_move2_kernel
          .align    16, 0x90
      ___OpenCL_move2_kernel:                 # @__OpenCL_move2_kernel
      # BB#0:                                 # %entry
          pushl    %eax
          leal    8191(%esp), %eax
          movl    12(%esp), %ecx
          movl    8(%esp), %edx
          andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
          movl    -16(%eax), %eax
          shll    $6, %eax
          movsd    (%ecx,%eax), %xmm0
          movsd    %xmm0, (%edx,%eax)
          popl    %eax
          ret

          .def     ___OpenCL_move2_stub;
          .scl    2;
          .type    32;
          .endef
          .globl    ___OpenCL_move2_stub
          .align    16, 0x90
      ___OpenCL_move2_stub:                   # @__OpenCL_move2_stub
      # BB#0:                                 # %entry
          pushl    %eax
          leal    8191(%esp), %edx
          movl    8(%esp), %eax
          andl    $-8192, %edx            # imm = 0xFFFFFFFFFFFFE000
          movl    -16(%edx), %edx
          movl    (%eax), %ecx
          movl    4(%eax), %eax
          shll    $6, %edx
          movsd    (%eax,%edx), %xmm0
          movsd    %xmm0, (%ecx,%edx)
          popl    %eax
          ret

      so the move is done with the integer register 4 bytes at the time. Better, but not perfect.

      any idea ?

      another question:

      in the generated code i see two implementations:

      ___OpenCL_move2_kernel

      and

      ___OpenCL_move2_stub

      why ?

      thank you, ...

       

       

       

        • help understanding the generated code
          gaddo

          of course i am a fool: the move2 kernel actually use a a mvsd

          but with this kernel

           

          __kernel void move2(__global  uchar8 * outputv,
                             __global  uchar8 * inputv)
          {
              uint tid = get_global_id(0) * 8;
              uchar8 i = inputv[tid];
              outputv[tid] = i*2;
          }

          the generate code is

              .def     ___OpenCL_move2_kernel;
              .scl    2;
              .type    32;
              .endef
              .globl    ___OpenCL_move2_kernel
              .align    16, 0x90
          ___OpenCL_move2_kernel:                 # @__OpenCL_move2_kernel
          # BB#0:                                 # %entry
              pushl    %ebx
              pushl    %esi
              pushl    %eax
              leal    8191(%esp), %eax
              movl    20(%esp), %ecx
              andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
              movl    -16(%eax), %eax
              shll    $6, %eax
              movd    (%ecx,%eax), %xmm0
              movd    4(%ecx,%eax), %xmm1
              movlhps    %xmm0, %xmm1            # xmm1 = xmm1[0],xmm0[0]
              shufps    $-30, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
              pxor    %xmm0, %xmm0
              pextrw    $1, %xmm1, %ecx
              movd    %xmm1, %ebx
              movl    %ecx, %edx
              addb    %cl, %cl
              addb    %bl, %bl
              andl    $65280, %edx            # imm = 0xFF00
              movzbl    %cl, %ecx
              movzbl    %bl, %esi
              addl    %edx, %edx
              orl    %edx, %ecx
              pextrw    $0, %xmm1, %edx
              andl    $65280, %edx            # imm = 0xFF00
              addl    %edx, %edx
              orl    %edx, %esi
              pinsrw    $0, %esi, %xmm0
              pinsrw    $1, %ecx, %xmm0
              pextrw    $2, %xmm1, %ecx
              movl    %ecx, %edx
              addb    %cl, %cl
              andl    $65280, %edx            # imm = 0xFF00
              movzbl    %cl, %ecx
              addl    %edx, %edx
              orl    %edx, %ecx
              pinsrw    $2, %ecx, %xmm0
              pextrw    $3, %xmm1, %ecx
              movl    %ecx, %edx
              addb    %cl, %cl
              andl    $65280, %edx            # imm = 0xFF00
              movzbl    %cl, %ecx
              addl    %edx, %edx
              orl    %edx, %ecx
              movl    16(%esp), %edx
              pinsrw    $3, %ecx, %xmm0
              movd    %xmm0, (%edx,%eax)
              pshufd    $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
              movd    %xmm0, 4(%edx,%eax)
              addl    $4, %esp
              popl    %esi
              popl    %ebx
              ret

          :(