Archives Discussions

gaddo · ‎12-15-2011

Hello,

i am experimenting with the OpenCL compiler with the CPU target.

i am trying to get the compiler to generate a movq

with this kernel , the code that i get

(I am using the wonderful AMD APP KernelAnalyzer 1.10.1149)

__kernel void move(__global uchar * outputv,
                   __global uchar * inputv)
{
    uint tid = get_global_id(0) * 8;
    uchar8 i = vload8(tid, inputv);
    vstore8(i, tid, outputv);
}

    .def    ___OpenCL_move_kernel;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move_kernel
    .align    16, 0x90
___OpenCL_move_kernel:                  # @__OpenCL_move_kernel
# BB#0:                                 # %entry
    pushl    %ebx
    pushl    %esi
    subl    $8, %esp
    leal    8195(%esp), %eax
    movl    24(%esp), %ecx
    movl    20(%esp), %esi
    andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%eax), %eax
    shll    $6, %eax
    movb    7(%ecx,%eax), %dl
    movb    4(%ecx,%eax), %dh
    movb    3(%ecx,%eax), %bl
    movb    2(%ecx,%eax), %bh
    movb    %dl, 3(%esp)            # 1-byte Spill
    movb    6(%ecx,%eax), %dl
    movb    %dl, 2(%esp)            # 1-byte Spill
    movb    5(%ecx,%eax), %dl
    movb    %dl, 1(%esp)            # 1-byte Spill
    movb    (%ecx,%eax), %dl
    movb    1(%ecx,%eax), %cl
    movb    2(%esp), %ch            # 1-byte Reload
    movb    %dl, (%esi,%eax)
    movb    %cl, 1(%esi,%eax)
    movb    1(%esp), %cl            # 1-byte Reload
    movb    %bh, 2(%esi,%eax)
    movb    %bl, 3(%esi,%eax)
    movb    %dh, 4(%esi,%eax)
    movb    %cl, 5(%esi,%eax)
    movb    3(%esp), %cl            # 1-byte Reload
    movb    %ch, 6(%esi,%eax)
    movb    %cl, 7(%esi,%eax)
    addl    $8, %esp
    popl    %esi
    popl    %ebx
    ret

so the vload is loading and storing 1byte at the time 😞

with this kernel

__kernel void move2(__global uchar8 * outputv,
                   __global uchar8 * inputv)
{
    uint tid = get_global_id(0) * 8;
    uchar8 i = inputv[tid];
    outputv[tid] = i;
}

the generated code is

    .def    ___OpenCL_move2_kernel;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move2_kernel
    .align    16, 0x90
___OpenCL_move2_kernel:                 # @__OpenCL_move2_kernel
# BB#0:                                 # %entry
    pushl    %eax
    leal    8191(%esp), %eax
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%eax), %eax
    shll    $6, %eax
    movsd    (%ecx,%eax), %xmm0
    movsd    %xmm0, (%edx,%eax)
    popl    %eax
    ret

    .def    ___OpenCL_move2_stub;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move2_stub
    .align    16, 0x90
___OpenCL_move2_stub:                   # @__OpenCL_move2_stub
# BB#0:                                 # %entry
    pushl    %eax
    leal    8191(%esp), %edx
    movl    8(%esp), %eax
    andl    $-8192, %edx            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%edx), %edx
    movl    (%eax), %ecx
    movl    4(%eax), %eax
    shll    $6, %edx
    movsd    (%eax,%edx), %xmm0
    movsd    %xmm0, (%ecx,%edx)
    popl    %eax
    ret

so the move is done with the integer register 4 bytes at the time. Better, but not perfect.

any idea ?

another question:

in the generated code i see two implementations:

___OpenCL_move2_kernel

and

___OpenCL_move2_stub

why ?

thank you, ...

gaddo · ‎12-15-2011

of course i am a fool: the move2 kernel actually use a a mvsd

but with this kernel

__kernel void move2(__global uchar8 * outputv,
                   __global uchar8 * inputv)
{
    uint tid = get_global_id(0) * 8;
    uchar8 i = inputv[tid];
    outputv[tid] = i*2;
}

the generate code is

    .def    ___OpenCL_move2_kernel;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move2_kernel
    .align    16, 0x90
___OpenCL_move2_kernel:                 # @__OpenCL_move2_kernel
# BB#0:                                 # %entry
    pushl    %ebx
    pushl    %esi
    pushl    %eax
    leal    8191(%esp), %eax
    movl    20(%esp), %ecx
    andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%eax), %eax
    shll    $6, %eax
    movd    (%ecx,%eax), %xmm0
    movd    4(%ecx,%eax), %xmm1
    movlhps    %xmm0, %xmm1            # xmm1 = xmm1[0],xmm0[0]
    shufps    $-30, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
    pxor    %xmm0, %xmm0
    pextrw    $1, %xmm1, %ecx
    movd    %xmm1, %ebx
    movl    %ecx, %edx
    addb    %cl, %cl
    addb    %bl, %bl
    andl    $65280, %edx            # imm = 0xFF00
    movzbl    %cl, %ecx
    movzbl    %bl, %esi
    addl    %edx, %edx
    orl    %edx, %ecx
    pextrw    $0, %xmm1, %edx
    andl    $65280, %edx            # imm = 0xFF00
    addl    %edx, %edx
    orl    %edx, %esi
    pinsrw    $0, %esi, %xmm0
    pinsrw    $1, %ecx, %xmm0
    pextrw    $2, %xmm1, %ecx
    movl    %ecx, %edx
    addb    %cl, %cl
    andl    $65280, %edx            # imm = 0xFF00
    movzbl    %cl, %ecx
    addl    %edx, %edx
    orl    %edx, %ecx
    pinsrw    $2, %ecx, %xmm0
    pextrw    $3, %xmm1, %ecx
    movl    %ecx, %edx
    addb    %cl, %cl
    andl    $65280, %edx            # imm = 0xFF00
    movzbl    %cl, %ecx
    addl    %edx, %edx
    orl    %edx, %ecx
    movl    16(%esp), %edx
    pinsrw    $3, %ecx, %xmm0
    movd    %xmm0, (%edx,%eax)
    pshufd    $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
    movd    %xmm0, 4(%edx,%eax)
    addl    $4, %esp
    popl    %esi
    popl    %ebx
    ret

😞

eugenek · ‎12-16-2011

Wow, that second version is really horrible. Maybe optimizations were disabled??

gaddo · ‎12-16-2011

if i understand the documentation correctly, the code generation is optimized by default and i have to ask to disable optimization if i need to.

if so, no, i did not disable the optimizations ..

Archives Discussions

help understanding the generated code