cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

abaralgin
Journeyman III

OpenCL kernel CAL optimizations, possible?

OpenCL kernel CAL optimizations

Is there possibility to optimize (re-write) OpenCL kernel/func in CAL (IL)?

Trying to use bin tools clc llc,but from trivial kernel got IL with a lot of "unnecessary" instructions .

For instance trivial kernel:

__kernel void k0(__global  unsigned int * output, __global  unsigned int * input)
{
    uint tid = get_global_id(0);
    output[tid] = input[tid] + 7;
}

after llvm backend a lot of stuff (what's this obfuscation?):

il_cs_2_0
dcl_cb cb0[9] ; Constant buffer that holds ABI data
dcl_literal l0, 4, 1, 2, 3
dcl_literal l1, 0x00FFFFFF, -1, -2, -3
dcl_literal l2, 0x0000FFFF, 0xFFFFFFFE,0x000000FF,0xFFFFFFFC
dcl_literal l3, 24, 16, 8, 0xFFFFFFFF
dcl_literal l4, 0xFFFFFF00, 0xFFFF0000, 0xFF00FFFF, 0xFFFF00FF
dcl_literal l5, 0, 4, 8, 12
dcl_literal l6, 32, 32, 32, 32
mov r769, cb0[8].x
;$$$$$$$$$$
endmain
func 1204 ; __OpenCL_k0_kernel
mov r770, l1.0
dcl_literal l7, 0x0000000c, 0x0000000c, 0x0000000c, 0x0000000c; int: 12
dcl_literal l8, 0x00000008, 0x00000008, 0x00000008, 0x00000008; int: 8
dcl_literal l9, 0x00000002, 0x00000002, 0x00000002, 0x00000002; int: 2
dcl_literal l10, 0x00000000, 0x00000000, 0x00000000, 0x00000000; int: 0
dcl_literal l11, 0xfffffffd, 0xfffffffd, 0xfffffffd, 0xfffffffd; int: 4294967293
dcl_literal l12, 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe; int: 4294967294
dcl_literal l13, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff; int: 4294967295
dcl_literal l14, 0x00000004, 0x00000004, 0x00000004, 0x00000004; int: 4
dcl_literal l15, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0; int: 4294967280
dcl_literal l16, 0x00000007, 0x00000007, 0x00000007, 0x00000007; int: 7
dcl_num_thread_per_group 64, 1, 1             
dcl_raw_uav_id(1)
mov r0.z, vThreadGrpIdFlat.x
mov r1022.xyz0, vTidInGrp.xyz
imul r0.w, cb0[2].x, cb0[2].y
umod r1023.x, r0.z, cb0[2].x
udiv r1023.y, r0.z, cb0[2].x
umod r1023.y, r1023.y, cb0[2].y
udiv r1023.z, r0.z, r0.w
imad r1021.xyz0, r1023.xyz0, cb0[1].xyz0, r1022.xyz0
iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0
iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0
mov r1023.w, r0.z
imad r772.x, r1023.w, cb0[4].y, cb0[4].x
ishl r1023.w, r1023.w, l0.z
mov r771.x, l0.0
dcl_cb cb1[2]
dcl_indexed_temp_array x0[2]
mov x0[0], r0.0
mov x0[1], r0.0
; Kernel arg setup: output
mov r1, cb1[0]
; Kernel arg setup: input
mov r2, cb1[1]
call 1205 ; k0
ret
endfunc ; __OpenCL_k0_kernel
;ARGSTART:__OpenCL_k0_kernel
;version:1:4:50
;uniqueid:1204
;memory:hwprivate:32
;memory:local:0
;pointerutput:i32:1:1:0:uav:1
;pointer:input:i32:1:1:16:uav:1
;function:1:1205
;intrinsic:0
;ARGEND:__OpenCL_k0_kernel
func 1205 ; k0
    mov r176.x___, r2.xxxx
    mov r177.x___, r1.xxxx
    mov r178, l7
    mov r1094, l8
    mov r179.x___, r1094.xxxx
    and r178.x___, r179.xxxx, r178.xxxx
    mov r181, l9
    ushr r178.x___, r178.xxxx, r181.xxxx
    mov r182, l10
    mov r183, r178.xxxx
    mov r184, r182.xxxx
    mov r185, l11
    iadd r183, r183.xyz0, r178.000x
    iadd r185, r184.xyz0, r185.000x
    mov r186, l12
    iadd r183, r183.xy0w, r178.00x0
    iadd r184, r184.xyz0, r182.000x
    iadd r185, r185.xy0w, r186.00x0
    mov r186, l13
    iadd r184, r184.xy0w, r182.00x0
    iadd r178, r183.x0zw, r178.0x00
    iadd r183, r185.x0zw, r186.0x00
    mov r1095, l10
    mov r2.x___, r177.xxxx
    mov r1.x___, r1095.xxxx
    call 1061 ; get32BitStorePrivate
    iadd r177, r178, r183
    iadd r178, r184.x0zw, r182.0x00
    mov r1096, l14
    mov r2.x___, r176.xxxx
    mov r1.x___, r1096.xxxx
    call 1061 ; get32BitStorePrivate
    mov r176, r177.x000
    mov r183, r178.x000
    mov r1.x___, r182.xxxx
    call 1027 ; get_global_id
    mov r182.x___, r1.xxxx
    mov r176, r176.xxxx
    mov r184, r177.y000
    mov r183, r183.xxxx
    mov r185, r178.y000
    iadd r176, r176.x0zw, r184.0x00
    iadd r183, r183.x0zw, r185.0x00
    mov r184, r177.z000
    mov r185, r178.z000
    mov r186, l15
    mov r1097, l8
    mov r2.x___, r182.xxxx
    mov r1.x___, r1097.xxxx
    call 1061 ; get32BitStorePrivate
    and r179.x___, r179.xxxx, r186.xxxx
    mov r182, r176.x000
    mov r186, r183.x000
    mov r184, r184.xxxx
    mov r177, r177.w000
    mov r185, r185.xxxx
    mov r178, r178.w000
    mov r1.x___, r179.xxxx
    call 1066 ; get128BitLoadPrivate
    mov r179, r1
    ieq r182.x___, r182.xxxx, r186.xxxx
    iadd r177, r184.x0zw, r177.0x00
    iadd r178, r185.x0zw, r178.0x00
    mov r176, r176.y000
    mov r183, r183.y000
    ieq r176.x___, r176.xxxx, r183.xxxx
    mov r177, r177.x000
    mov r178, r178.x000
    mov r183, r179.w000
    mov r184, r179.x000
    ieq r177.x___, r177.xxxx, r178.xxxx
    cmov_logical r178.x___, r182.xxxx, r184.xxxx, r183.xxxx
    mov r182, r179.y000
    cmov_logical r176.x___, r176.xxxx, r182.xxxx, r178.xxxx
    mov r178, r179.z000
    cmov_logical r176.x___, r177.xxxx, r178.xxxx, r176.xxxx
    ishl r176.x___, r176.xxxx, r181.xxxx
    mov r1098, l14
    mov r1.x___, r1098.xxxx
    call 1064 ; get32BitLoadPrivate
    mov r177.x___, r1.xxxx
    iadd r177.x___, r177.xxxx, r176.xxxx
    mov r1099, l10
    mov r1.x___, r1099.xxxx
    call 1064 ; get32BitLoadPrivate
    mov r178.x___, r1.xxxx
    mov r1.x___, r177.xxxx
    call 1083 ; get32BitLoadUAV
    mov r177.x___, r1.xxxx
    mov r179, l16
    iadd r177.x___, r177.xxxx, r179.xxxx
    iadd r176.x___, r178.xxxx, r176.xxxx
    mov r2.x___, r177.xxxx
    mov r1.x___, r176.xxxx
    call 1078 ; get32BitStoreUAV
    ret
endfunc ; k0
;ARGSTART:k0
;uniqueid:1205
;memory:hwprivate:0
;memory:local:0
;function:6:1027:1061:1064:1066:1078:1083
;intrinsic:0
;ARGEND:k0
func 1027 ; get_global_id
iadd r1020, r1.xxxx, l1.0yzw
ieq r1020, r1020, l0.0000
cmov_logical r1.x, r1020.x, r1021.x, r1021.0
cmov_logical r1.x, r1020.y, r1021.y, r1.x
cmov_logical r1.x, r1020.z, r1021.z, r1.x
ret
endfunc ; get_global_id
;ARGSTART:compiler_intrinsic
;uniqueid:1027
;ARGEND:compiler_intrinsic
func 1061 ; Store32BitsPrivate
ishr r1.x, r1.x, l0.z
and r1.w, r1.x, l0.w
ishr r1.x, r1.x, l0.z
switch r1.w
default
    mov x0[r1.x].x___, r2.x
break
case 1
    mov x0[r1.x]._y__, r2.x
break
case 2
    mov x0[r1.x].__z_, r2.x
break
case 3
    mov x0[r1.x].___w, r2.x
break
endswitch
ret
endfunc ; Store32BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1061
;ARGEND:compiler_intrinsic
func 1064 ; Load32BitsPrivate
ishr r1.x, r1.x, l0.z
and r1.w, r1.x, l0.w
ishr r1.x, r1.x, l0.z
iadd r1020, r1.wwww, l1.0yzw
ieq r1020, r1020, l0.0000
mov r1, x0[r1.x]
cmov_logical r1.x, r1020.y, r1.y, r1.x
cmov_logical r1.x, r1020.z, r1.z, r1.x
cmov_logical r1.x, r1020.w, r1.w, r1.x
ret
endfunc ; Load32BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1064
;ARGEND:compiler_intrinsic
func 1066 ; Load128BitsPrivate
ishr r1.x, r1.x, l0.x
mov r1, x0[r1.x]
ret
endfunc ; Load128BitsPrivate
;ARGSTART:compiler_intrinsic
;uniqueid:1066
;ARGEND:compiler_intrinsic
func 1078 ; Store32BitsUAV
uav_raw_store_id(1) mem0.x___, r1.x, r2
ret
endfunc ; Store32BitsUAV
;ARGSTART:compiler_intrinsic
;uniqueid:1078
;ARGEND:compiler_intrinsic
func 1083 ; Load32BitsUAV
uav_raw_load_id(1) r1.x___, r1.x
ret
endfunc ; Load32BitsUAV
;ARGSTART:compiler_intrinsic
;uniqueid:1083
;ARGEND:compiler_intrinsic

end

0 Likes
2 Replies
ryta1203
Journeyman III

I'm not sure about your particular example, but just as a general guideline: IL != ISA instructions

I have seen a lot of long IL kernels that when compiled end up in fairly short ISA.

So, what I'm asking is, what does the ISA look like?

0 Likes

abaralgin,
The problem here is you are not using the 'official' tool chain. Please use GPU_DUMP_DEVICE_KERNEL=1 to get the IL. By using clc/llc directly, you are doing the equivalent of -O0 in GCC, which means there are no optimizations performed on the code, including removal of stack allocations of variables.
0 Likes