gaddo

help understanding the generated code

Discussion created by gaddo on Dec 15, 2011
Latest reply on Dec 16, 2011 by gaddo

Hello,

i am experimenting with the OpenCL compiler with the CPU target.

i am trying to get the compiler to generate a movq

with this kernel , the code that i get 

(I am using the wonderful AMD APP KernelAnalyzer 1.10.1149)

__kernel void move(__global  uchar * outputv,
                   __global  uchar * inputv)
{
    uint tid = get_global_id(0) * 8;
    uchar8 i = vload8(tid, inputv);
    vstore8(i, tid, outputv);
}

    .def     ___OpenCL_move_kernel;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move_kernel
    .align    16, 0x90
___OpenCL_move_kernel:                  # @__OpenCL_move_kernel
# BB#0:                                 # %entry
    pushl    %ebx
    pushl    %esi
    subl    $8, %esp
    leal    8195(%esp), %eax
    movl    24(%esp), %ecx
    movl    20(%esp), %esi
    andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%eax), %eax
    shll    $6, %eax
    movb    7(%ecx,%eax), %dl
    movb    4(%ecx,%eax), %dh
    movb    3(%ecx,%eax), %bl
    movb    2(%ecx,%eax), %bh
    movb    %dl, 3(%esp)            # 1-byte Spill
    movb    6(%ecx,%eax), %dl
    movb    %dl, 2(%esp)            # 1-byte Spill
    movb    5(%ecx,%eax), %dl
    movb    %dl, 1(%esp)            # 1-byte Spill
    movb    (%ecx,%eax), %dl
    movb    1(%ecx,%eax), %cl
    movb    2(%esp), %ch            # 1-byte Reload
    movb    %dl, (%esi,%eax)
    movb    %cl, 1(%esi,%eax)
    movb    1(%esp), %cl            # 1-byte Reload
    movb    %bh, 2(%esi,%eax)
    movb    %bl, 3(%esi,%eax)
    movb    %dh, 4(%esi,%eax)
    movb    %cl, 5(%esi,%eax)
    movb    3(%esp), %cl            # 1-byte Reload
    movb    %ch, 6(%esi,%eax)
    movb    %cl, 7(%esi,%eax)
    addl    $8, %esp
    popl    %esi
    popl    %ebx
    ret

so the vload is loading and storing 1byte at the time :(

with this kernel

__kernel void move2(__global  uchar8 * outputv,
                   __global  uchar8 * inputv)
{
    uint tid = get_global_id(0) * 8;
    uchar8 i = inputv[tid];
    outputv[tid] = i;
}

the generated code is

    .def     ___OpenCL_move2_kernel;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move2_kernel
    .align    16, 0x90
___OpenCL_move2_kernel:                 # @__OpenCL_move2_kernel
# BB#0:                                 # %entry
    pushl    %eax
    leal    8191(%esp), %eax
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    andl    $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%eax), %eax
    shll    $6, %eax
    movsd    (%ecx,%eax), %xmm0
    movsd    %xmm0, (%edx,%eax)
    popl    %eax
    ret

    .def     ___OpenCL_move2_stub;
    .scl    2;
    .type    32;
    .endef
    .globl    ___OpenCL_move2_stub
    .align    16, 0x90
___OpenCL_move2_stub:                   # @__OpenCL_move2_stub
# BB#0:                                 # %entry
    pushl    %eax
    leal    8191(%esp), %edx
    movl    8(%esp), %eax
    andl    $-8192, %edx            # imm = 0xFFFFFFFFFFFFE000
    movl    -16(%edx), %edx
    movl    (%eax), %ecx
    movl    4(%eax), %eax
    shll    $6, %edx
    movsd    (%eax,%edx), %xmm0
    movsd    %xmm0, (%ecx,%edx)
    popl    %eax
    ret

so the move is done with the integer register 4 bytes at the time. Better, but not perfect.

any idea ?

another question:

in the generated code i see two implementations:

___OpenCL_move2_kernel

and

___OpenCL_move2_stub

why ?

thank you, ...

 

 

 

Outcomes