Is there possibility to optimize (re-write) OpenCL kernel/func in CAL (IL)?
Trying to use bin tools clc llc,but from trivial kernel got IL with a lot of "unnecessary" instructions .
For instance trivial kernel:
__kernel void k0(__global unsigned int * output, __global unsigned int * input)
{
uint tid = get_global_id(0);
output[tid] = input[tid] + 7;
}
after llvm backend a lot of stuff (what's this obfuscation?):
il_cs_2_0
dcl_cb cb0[9] ; Constant buffer that holds ABI data
dcl_literal l0, 4, 1, 2, 3
dcl_literal l1, 0x00FFFFFF, -1, -2, -3
dcl_literal l2, 0x0000FFFF, 0xFFFFFFFE,0x000000FF,0xFFFFFFFC
dcl_literal l3, 24, 16, 8, 0xFFFFFFFF
dcl_literal l4, 0xFFFFFF00, 0xFFFF0000, 0xFF00FFFF, 0xFFFF00FF
dcl_literal l5, 0, 4, 8, 12
dcl_literal l6, 32, 32, 32, 32
mov r769, cb0[8].x
;$$$$$$$$$$
endmain
func 1204 ; __OpenCL_k0_kernel
mov r770, l1.0
dcl_literal l7, 0x0000000c, 0x0000000c, 0x0000000c, 0x0000000c; int: 12
dcl_literal l8, 0x00000008, 0x00000008, 0x00000008, 0x00000008; int: 8
dcl_literal l9, 0x00000002, 0x00000002, 0x00000002, 0x00000002; int: 2
dcl_literal l10, 0x00000000, 0x00000000, 0x00000000, 0x00000000; int: 0
dcl_literal l11, 0xfffffffd, 0xfffffffd, 0xfffffffd, 0xfffffffd; int: 4294967293
dcl_literal l12, 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe; int: 4294967294
dcl_literal l13, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff; int: 4294967295
dcl_literal l14, 0x00000004, 0x00000004, 0x00000004, 0x00000004; int: 4
dcl_literal l15, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0; int: 4294967280
dcl_literal l16, 0x00000007, 0x00000007, 0x00000007, 0x00000007; int: 7
dcl_num_thread_per_group 64, 1, 1
dcl_raw_uav_id(1)
mov r0.z, vThreadGrpIdFlat.x
mov r1022.xyz0, vTidInGrp.xyz
imul r0.w, cb0[2].x, cb0[2].y
umod r1023.x, r0.z, cb0[2].x
udiv r1023.y, r0.z, cb0[2].x
umod r1023.y, r1023.y, cb0[2].y
udiv r1023.z, r0.z, r0.w
imad r1021.xyz0, r1023.xyz0, cb0[1].xyz0, r1022.xyz0
iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0
iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0
mov r1023.w, r0.z
imad r772.x, r1023.w, cb0[4].y, cb0[4].x
ishl r1023.w, r1023.w, l0.z
mov r771.x, l0.0
dcl_cb cb1[2]
dcl_indexed_temp_array x0[2]
mov x0[0], r0.0
mov x0[1], r0.0
; Kernel arg setup: output
mov r1, cb1[0]
; Kernel arg setup: input
mov r2, cb1[1]
call 1205 ; k0
ret
endfunc ; __OpenCL_k0_kernel
;ARGSTART:__OpenCL_k0_kernel
;version:1:4:50
;uniqueid:1204
;memory:hwprivate:32
;memory:local:0
;pointerutput:i32:1:1:0:uav:1
;pointer:input:i32:1:1:16:uav:1
;function:1:1205
;intrinsic:0
;ARGEND:__OpenCL_k0_kernel
func 1205 ; k0
mov r176.x___, r2.xxxx
mov r177.x___, r1.xxxx
mov r178, l7
mov r1094, l8
mov r179.x___, r1094.xxxx
and r178.x___, r179.xxxx, r178.xxxx
mov r181, l9
ushr r178.x___, r178.xxxx, r181.xxxx
mov r182, l10
mov r183, r178.xxxx
mov r184, r182.xxxx
mov r185, l11
iadd r183, r183.xyz0, r178.000x
iadd r185, r184.xyz0, r185.000x
mov r186, l12
iadd r183, r183.xy0w, r178.00x0
iadd r184, r184.xyz0, r182.000x
iadd r185, r185.xy0w, r186.00x0
mov r186, l13
iadd r184, r184.xy0w, r182.00x0
iadd r178, r183.x0zw, r178.0x00
iadd r183, r185.x0zw, r186.0x00
mov r1095, l10
mov r2.x___, r177.xxxx
mov r1.x___, r1095.xxxx
call 1061 ; get32BitStorePrivate
iadd r177, r178, r183
iadd r178, r184.x0zw, r182.0x00
mov r1096, l14
mov r2.x___, r176.xxxx
mov r1.x___, r1096.xxxx
call 1061 ; get32BitStorePrivate
mov r176, r177.x000
mov r183, r178.x000
mov r1.x___, r182.xxxx
call 1027 ; get_global_id
mov r182.x___, r1.xxxx
mov r176, r176.xxxx
mov r184, r177.y000
mov r183, r183.xxxx
mov r185, r178.y000
iadd r176, r176.x0zw, r184.0x00
iadd r183, r183.x0zw, r185.0x00
mov r184, r177.z000
mov r185, r178.z000
mov r186, l15
mov r1097, l8
mov r2.x___, r182.xxxx
mov r1.x___, r1097.xxxx
call 1061 ; get32BitStorePrivate
and r179.x___, r179.xxxx, r186.xxxx
mov r182, r176.x000
mov r186, r183.x000
mov r184, r184.xxxx
mov r177, r177.w000
mov r185, r185.xxxx
mov r178, r178.w000
mov r1.x___, r179.xxxx
call 1066 ; get128BitLoadPrivate
mov r179, r1
ieq r182.x___, r182.xxxx, r186.xxxx
iadd r177, r184.x0zw, r177.0x00
iadd r178, r185.x0zw, r178.0x00
mov r176, r176.y000
mov r183, r183.y000
ieq r176.x___, r176.xxxx, r183.xxxx
mov r177, r177.x000
mov r178, r178.x000
mov r183, r179.w000
mov r184, r179.x000
ieq r177.x___, r177.xxxx, r178.xxxx
cmov_logical r178.x___, r182.xxxx, r184.xxxx, r183.xxxx
mov r182, r179.y000
cmov_logical r176.x___, r176.xxxx, r182.xxxx, r178.xxxx
mov r178, r179.z000
cmov_logical r176.x___, r177.xxxx, r178.xxxx, r176.xxxx
ishl r176.x___, r176.xxxx, r181.xxxx
mov r1098, l14
mov r1.x___, r1098.xxxx
call 1064 ; get32BitLoadPrivate
mov r177.x___, r1.xxxx
iadd r177.x___, r177.xxxx, r176.xxxx
mov r1099, l10
mov r1.x___, r1099.xxxx
call 1064 ; get32BitLoadPrivate
mov r178.x___, r1.xxxx
mov r1.x___, r177.xxxx
call 1083 ; get32BitLoadUAV
mov r177.x___, r1.xxxx
mov r179, l16
iadd r177.x___, r177.xxxx, r179.xxxx
iadd r176.x___, r178.xxxx, r176.xxxx
mov r2.x___, r177.xxxx
mov r1.x___, r176.xxxx
call 1078 ; get32BitStoreUAV
ret
endfunc ; k0
;ARGSTART:k0
;uniqueid:1205
;memory:hwprivate:0
;memory:local:0
;function:6:1027:1061:1064:1066:1078:1083
;intrinsic:0
;ARGEND:k0
func 1027 ; get_global_id
iadd r1020, r1.xxxx, l1.0yzw
ieq r1020, r1020, l0.0000
cmov_logical r1.x, r1020.x, r1021.x, r1021.0
cmov_logical r1.x, r1020.y, r1021.y, r1.x
cmov_logical r1.x, r1020.z, r1021.z, r1.x
ret
endfunc ; get_global_id
;ARGSTART:compiler_intrinsic
;uniqueid:1027
;ARGEND:compiler_intrinsic
func 1061 ; Store32BitsPrivate
ishr r1.x, r1.x, l0.z
and r1.w, r1.x, l0.w
ishr r1.x, r1.x, l0.z
switch r1.w
default
mov x0[r1.x].x___, r2.x
break
case 1
mov x0[r1.x]._y__, r2.x
break
case 2
mov x0[r1.x].__z_, r2.x
break
case 3
mov x0[r1.x].___w, r2.x
break
endswitch
ret
endfunc ; Store32BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1061
;ARGEND:compiler_intrinsic
func 1064 ; Load32BitsPrivate
ishr r1.x, r1.x, l0.z
and r1.w, r1.x, l0.w
ishr r1.x, r1.x, l0.z
iadd r1020, r1.wwww, l1.0yzw
ieq r1020, r1020, l0.0000
mov r1, x0[r1.x]
cmov_logical r1.x, r1020.y, r1.y, r1.x
cmov_logical r1.x, r1020.z, r1.z, r1.x
cmov_logical r1.x, r1020.w, r1.w, r1.x
ret
endfunc ; Load32BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1064
;ARGEND:compiler_intrinsic
func 1066 ; Load128BitsPrivate
ishr r1.x, r1.x, l0.x
mov r1, x0[r1.x]
ret
endfunc ; Load128BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1066
;ARGEND:compiler_intrinsic
func 1078 ; Store32BitsUAV
uav_raw_store_id(1) mem0.x___, r1.x, r2
ret
endfunc ; Store32BitsUAV
;ARGSTART:compiler_intrinsic
;uniqueid:1078
;ARGEND:compiler_intrinsic
func 1083 ; Load32BitsUAV
uav_raw_load_id(1) r1.x___, r1.x
ret
endfunc ; Load32BitsUAV
;ARGSTART:compiler_intrinsic
;uniqueid:1083
;ARGEND:compiler_intrinsic
end
I'm not sure about your particular example, but just as a general guideline: IL != ISA instructions
I have seen a lot of long IL kernels that when compiled end up in fairly short ISA.
So, what I'm asking is, what does the ISA look like?