2 Replies Latest reply on Jun 16, 2010 3:44 PM by MicahVillmow

    OpenCL kernel CAL optimizations, possible?

      OpenCL kernel CAL optimizations

      Is there possibility to optimize (re-write) OpenCL kernel/func in CAL (IL)?

      Trying to use bin tools clc llc,but from trivial kernel got IL with a lot of "unnecessary" instructions .

      For instance trivial kernel:

      __kernel void k0(__global  unsigned int * output, __global  unsigned int * input)
          uint tid = get_global_id(0);
          output[tid] = input[tid] + 7;

      after llvm backend a lot of stuff (what's this obfuscation?):

      dcl_cb cb0[9] ; Constant buffer that holds ABI data
      dcl_literal l0, 4, 1, 2, 3
      dcl_literal l1, 0x00FFFFFF, -1, -2, -3
      dcl_literal l2, 0x0000FFFF, 0xFFFFFFFE,0x000000FF,0xFFFFFFFC
      dcl_literal l3, 24, 16, 8, 0xFFFFFFFF
      dcl_literal l4, 0xFFFFFF00, 0xFFFF0000, 0xFF00FFFF, 0xFFFF00FF
      dcl_literal l5, 0, 4, 8, 12
      dcl_literal l6, 32, 32, 32, 32
      mov r769, cb0[8].x
      func 1204 ; __OpenCL_k0_kernel
      mov r770, l1.0
      dcl_literal l7, 0x0000000c, 0x0000000c, 0x0000000c, 0x0000000c; int: 12
      dcl_literal l8, 0x00000008, 0x00000008, 0x00000008, 0x00000008; int: 8
      dcl_literal l9, 0x00000002, 0x00000002, 0x00000002, 0x00000002; int: 2
      dcl_literal l10, 0x00000000, 0x00000000, 0x00000000, 0x00000000; int: 0
      dcl_literal l11, 0xfffffffd, 0xfffffffd, 0xfffffffd, 0xfffffffd; int: 4294967293
      dcl_literal l12, 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe; int: 4294967294
      dcl_literal l13, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff; int: 4294967295
      dcl_literal l14, 0x00000004, 0x00000004, 0x00000004, 0x00000004; int: 4
      dcl_literal l15, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0; int: 4294967280
      dcl_literal l16, 0x00000007, 0x00000007, 0x00000007, 0x00000007; int: 7
      dcl_num_thread_per_group 64, 1, 1             
      mov r0.z, vThreadGrpIdFlat.x
      mov r1022.xyz0, vTidInGrp.xyz
      imul r0.w, cb0[2].x, cb0[2].y
      umod r1023.x, r0.z, cb0[2].x
      udiv r1023.y, r0.z, cb0[2].x
      umod r1023.y, r1023.y, cb0[2].y
      udiv r1023.z, r0.z, r0.w
      imad r1021.xyz0, r1023.xyz0, cb0[1].xyz0, r1022.xyz0
      iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0
      iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0
      mov r1023.w, r0.z
      imad r772.x, r1023.w, cb0[4].y, cb0[4].x
      ishl r1023.w, r1023.w, l0.z
      mov r771.x, l0.0
      dcl_cb cb1[2]
      dcl_indexed_temp_array x0[2]
      mov x0[0], r0.0
      mov x0[1], r0.0
      ; Kernel arg setup: output
      mov r1, cb1[0]
      ; Kernel arg setup: input
      mov r2, cb1[1]
      call 1205 ; k0
      endfunc ; __OpenCL_k0_kernel
      func 1205 ; k0
          mov r176.x___, r2.xxxx
          mov r177.x___, r1.xxxx
          mov r178, l7
          mov r1094, l8
          mov r179.x___, r1094.xxxx
          and r178.x___, r179.xxxx, r178.xxxx
          mov r181, l9
          ushr r178.x___, r178.xxxx, r181.xxxx
          mov r182, l10
          mov r183, r178.xxxx
          mov r184, r182.xxxx
          mov r185, l11
          iadd r183, r183.xyz0, r178.000x
          iadd r185, r184.xyz0, r185.000x
          mov r186, l12
          iadd r183, r183.xy0w, r178.00x0
          iadd r184, r184.xyz0, r182.000x
          iadd r185, r185.xy0w, r186.00x0
          mov r186, l13
          iadd r184, r184.xy0w, r182.00x0
          iadd r178, r183.x0zw, r178.0x00
          iadd r183, r185.x0zw, r186.0x00
          mov r1095, l10
          mov r2.x___, r177.xxxx
          mov r1.x___, r1095.xxxx
          call 1061 ; get32BitStorePrivate
          iadd r177, r178, r183
          iadd r178, r184.x0zw, r182.0x00
          mov r1096, l14
          mov r2.x___, r176.xxxx
          mov r1.x___, r1096.xxxx
          call 1061 ; get32BitStorePrivate
          mov r176, r177.x000
          mov r183, r178.x000
          mov r1.x___, r182.xxxx
          call 1027 ; get_global_id
          mov r182.x___, r1.xxxx
          mov r176, r176.xxxx
          mov r184, r177.y000
          mov r183, r183.xxxx
          mov r185, r178.y000
          iadd r176, r176.x0zw, r184.0x00
          iadd r183, r183.x0zw, r185.0x00
          mov r184, r177.z000
          mov r185, r178.z000
          mov r186, l15
          mov r1097, l8
          mov r2.x___, r182.xxxx
          mov r1.x___, r1097.xxxx
          call 1061 ; get32BitStorePrivate
          and r179.x___, r179.xxxx, r186.xxxx
          mov r182, r176.x000
          mov r186, r183.x000
          mov r184, r184.xxxx
          mov r177, r177.w000
          mov r185, r185.xxxx
          mov r178, r178.w000
          mov r1.x___, r179.xxxx
          call 1066 ; get128BitLoadPrivate
          mov r179, r1
          ieq r182.x___, r182.xxxx, r186.xxxx
          iadd r177, r184.x0zw, r177.0x00
          iadd r178, r185.x0zw, r178.0x00
          mov r176, r176.y000
          mov r183, r183.y000
          ieq r176.x___, r176.xxxx, r183.xxxx
          mov r177, r177.x000
          mov r178, r178.x000
          mov r183, r179.w000
          mov r184, r179.x000
          ieq r177.x___, r177.xxxx, r178.xxxx
          cmov_logical r178.x___, r182.xxxx, r184.xxxx, r183.xxxx
          mov r182, r179.y000
          cmov_logical r176.x___, r176.xxxx, r182.xxxx, r178.xxxx
          mov r178, r179.z000
          cmov_logical r176.x___, r177.xxxx, r178.xxxx, r176.xxxx
          ishl r176.x___, r176.xxxx, r181.xxxx
          mov r1098, l14
          mov r1.x___, r1098.xxxx
          call 1064 ; get32BitLoadPrivate
          mov r177.x___, r1.xxxx
          iadd r177.x___, r177.xxxx, r176.xxxx
          mov r1099, l10
          mov r1.x___, r1099.xxxx
          call 1064 ; get32BitLoadPrivate
          mov r178.x___, r1.xxxx
          mov r1.x___, r177.xxxx
          call 1083 ; get32BitLoadUAV
          mov r177.x___, r1.xxxx
          mov r179, l16
          iadd r177.x___, r177.xxxx, r179.xxxx
          iadd r176.x___, r178.xxxx, r176.xxxx
          mov r2.x___, r177.xxxx
          mov r1.x___, r176.xxxx
          call 1078 ; get32BitStoreUAV
      endfunc ; k0
      func 1027 ; get_global_id
      iadd r1020, r1.xxxx, l1.0yzw
      ieq r1020, r1020, l0.0000
      cmov_logical r1.x, r1020.x, r1021.x, r1021.0
      cmov_logical r1.x, r1020.y, r1021.y, r1.x
      cmov_logical r1.x, r1020.z, r1021.z, r1.x
      endfunc ; get_global_id
      func 1061 ; Store32BitsPrivate
      ishr r1.x, r1.x, l0.z
      and r1.w, r1.x, l0.w
      ishr r1.x, r1.x, l0.z
      switch r1.w
          mov x0[r1.x].x___, r2.x
      case 1
          mov x0[r1.x]._y__, r2.x
      case 2
          mov x0[r1.x].__z_, r2.x
      case 3
          mov x0[r1.x].___w, r2.x
      endfunc ; Store32BitsPrivate
      func 1064 ; Load32BitsPrivate
      ishr r1.x, r1.x, l0.z
      and r1.w, r1.x, l0.w
      ishr r1.x, r1.x, l0.z
      iadd r1020, r1.wwww, l1.0yzw
      ieq r1020, r1020, l0.0000
      mov r1, x0[r1.x]
      cmov_logical r1.x, r1020.y, r1.y, r1.x
      cmov_logical r1.x, r1020.z, r1.z, r1.x
      cmov_logical r1.x, r1020.w, r1.w, r1.x
      endfunc ; Load32BitsPrivate
      func 1066 ; Load128BitsPrivate
      ishr r1.x, r1.x, l0.x
      mov r1, x0[r1.x]
      endfunc ; Load128BitsPrivate
      func 1078 ; Store32BitsUAV
      uav_raw_store_id(1) mem0.x___, r1.x, r2
      endfunc ; Store32BitsUAV
      func 1083 ; Load32BitsUAV
      uav_raw_load_id(1) r1.x___, r1.x
      endfunc ; Load32BitsUAV