Hello,
i am experimenting with the OpenCL compiler with the CPU target.
i am trying to get the compiler to generate a movq
with this kernel , the code that i get
(I am using the wonderful AMD APP KernelAnalyzer 1.10.1149)
__kernel void move(__global uchar * outputv,
__global uchar * inputv)
{
uint tid = get_global_id(0) * 8;
uchar8 i = vload8(tid, inputv);
vstore8(i, tid, outputv);
}
.def ___OpenCL_move_kernel;
.scl 2;
.type 32;
.endef
.globl ___OpenCL_move_kernel
.align 16, 0x90
___OpenCL_move_kernel: # @__OpenCL_move_kernel
# BB#0: # %entry
pushl %ebx
pushl %esi
subl $8, %esp
leal 8195(%esp), %eax
movl 24(%esp), %ecx
movl 20(%esp), %esi
andl $-8192, %eax # imm = 0xFFFFFFFFFFFFE000
movl -16(%eax), %eax
shll $6, %eax
movb 7(%ecx,%eax), %dl
movb 4(%ecx,%eax), %dh
movb 3(%ecx,%eax), %bl
movb 2(%ecx,%eax), %bh
movb %dl, 3(%esp) # 1-byte Spill
movb 6(%ecx,%eax), %dl
movb %dl, 2(%esp) # 1-byte Spill
movb 5(%ecx,%eax), %dl
movb %dl, 1(%esp) # 1-byte Spill
movb (%ecx,%eax), %dl
movb 1(%ecx,%eax), %cl
movb 2(%esp), %ch # 1-byte Reload
movb %dl, (%esi,%eax)
movb %cl, 1(%esi,%eax)
movb 1(%esp), %cl # 1-byte Reload
movb %bh, 2(%esi,%eax)
movb %bl, 3(%esi,%eax)
movb %dh, 4(%esi,%eax)
movb %cl, 5(%esi,%eax)
movb 3(%esp), %cl # 1-byte Reload
movb %ch, 6(%esi,%eax)
movb %cl, 7(%esi,%eax)
addl $8, %esp
popl %esi
popl %ebx
ret
so the vload is loading and storing 1byte at the time 😞
with this kernel
__kernel void move2(__global uchar8 * outputv,
__global uchar8 * inputv)
{
uint tid = get_global_id(0) * 8;
uchar8 i = inputv[tid];
outputv[tid] = i;
}
the generated code is
.def ___OpenCL_move2_kernel;
.scl 2;
.type 32;
.endef
.globl ___OpenCL_move2_kernel
.align 16, 0x90
___OpenCL_move2_kernel: # @__OpenCL_move2_kernel
# BB#0: # %entry
pushl %eax
leal 8191(%esp), %eax
movl 12(%esp), %ecx
movl 8(%esp), %edx
andl $-8192, %eax # imm = 0xFFFFFFFFFFFFE000
movl -16(%eax), %eax
shll $6, %eax
movsd (%ecx,%eax), %xmm0
movsd %xmm0, (%edx,%eax)
popl %eax
ret
.def ___OpenCL_move2_stub;
.scl 2;
.type 32;
.endef
.globl ___OpenCL_move2_stub
.align 16, 0x90
___OpenCL_move2_stub: # @__OpenCL_move2_stub
# BB#0: # %entry
pushl %eax
leal 8191(%esp), %edx
movl 8(%esp), %eax
andl $-8192, %edx # imm = 0xFFFFFFFFFFFFE000
movl -16(%edx), %edx
movl (%eax), %ecx
movl 4(%eax), %eax
shll $6, %edx
movsd (%eax,%edx), %xmm0
movsd %xmm0, (%ecx,%edx)
popl %eax
ret
so the move is done with the integer register 4 bytes at the time. Better, but not perfect.
any idea ?
another question:
in the generated code i see two implementations:
___OpenCL_move2_kernel
and
___OpenCL_move2_stub
why ?
thank you, ...
of course i am a fool: the move2 kernel actually use a a mvsd
but with this kernel
__kernel void move2(__global uchar8 * outputv,
__global uchar8 * inputv)
{
uint tid = get_global_id(0) * 8;
uchar8 i = inputv[tid];
outputv[tid] = i*2;
}
the generate code is
.def ___OpenCL_move2_kernel;
.scl 2;
.type 32;
.endef
.globl ___OpenCL_move2_kernel
.align 16, 0x90
___OpenCL_move2_kernel: # @__OpenCL_move2_kernel
# BB#0: # %entry
pushl %ebx
pushl %esi
pushl %eax
leal 8191(%esp), %eax
movl 20(%esp), %ecx
andl $-8192, %eax # imm = 0xFFFFFFFFFFFFE000
movl -16(%eax), %eax
shll $6, %eax
movd (%ecx,%eax), %xmm0
movd 4(%ecx,%eax), %xmm1
movlhps %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0]
shufps $-30, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
pxor %xmm0, %xmm0
pextrw $1, %xmm1, %ecx
movd %xmm1, %ebx
movl %ecx, %edx
addb %cl, %cl
addb %bl, %bl
andl $65280, %edx # imm = 0xFF00
movzbl %cl, %ecx
movzbl %bl, %esi
addl %edx, %edx
orl %edx, %ecx
pextrw $0, %xmm1, %edx
andl $65280, %edx # imm = 0xFF00
addl %edx, %edx
orl %edx, %esi
pinsrw $0, %esi, %xmm0
pinsrw $1, %ecx, %xmm0
pextrw $2, %xmm1, %ecx
movl %ecx, %edx
addb %cl, %cl
andl $65280, %edx # imm = 0xFF00
movzbl %cl, %ecx
addl %edx, %edx
orl %edx, %ecx
pinsrw $2, %ecx, %xmm0
pextrw $3, %xmm1, %ecx
movl %ecx, %edx
addb %cl, %cl
andl $65280, %edx # imm = 0xFF00
movzbl %cl, %ecx
addl %edx, %edx
orl %edx, %ecx
movl 16(%esp), %edx
pinsrw $3, %ecx, %xmm0
movd %xmm0, (%edx,%eax)
pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
movd %xmm0, 4(%edx,%eax)
addl $4, %esp
popl %esi
popl %ebx
ret
😞
Wow, that second version is really horrible. Maybe optimizations were disabled??
if i understand the documentation correctly, the code generation is optimized by default and i have to ask to disable optimization if i need to.
if so, no, i did not disable the optimizations ..