AnsweredAssumed Answered

Multiplication in x86 OpenCl kernels

Question asked by vladant on Jul 30, 2012
Latest reply on Jul 31, 2012 by vladant

Hi,

Let us consider two multiplicatoin kernels.

 

1. Simple multiplication using the *operator:

__kernel void multiplication(__global uint4 *src, __global uint4 *dst)
{
     size_t i = get_global_id(0);

 

     dst[i] = src[i] * 3;
};

 

X86 Assembler:

pushl %eax

leal 8191(%esp), %eax

movl 8(%esp), %ecx

movl 12(%esp), %edx

andl $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000

movl -16(%eax), %eax

shll $4, %eax

movdqa (%ecx,%eax), %xmm0

pshufd $3, %xmm0, %xmm1        # xmm1 = xmm0[3,0,0,0]

pshufd $1, %xmm0, %xmm2        # xmm2 = xmm0[1,0,0,0]

movd %xmm1, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm1

movd %xmm2, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm2

movd %xmm0, %ecx

movhlps %xmm0, %xmm0            # xmm0 = xmm0[1,1]

leal (%ecx,%ecx,2), %ecx

punpckldq %xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]

movd %ecx, %xmm1

movd %xmm0, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm0

punpckldq %xmm0, %xmm1    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]

punpckldq %xmm2, %xmm1    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]

movdqa %xmm1, (%edx,%eax)

popl %eax

ret

 

2. Simple multiplication using mul24 (or mul_hi):

__kernel void multiplication(__global uint4 *src, __global uint4 *dst)
{
     size_t i = get_global_id(0);

 

     dst[i] = mul24(src[i], 3);
};

 

X86 Assembler:

pushl %eax

leal 8191(%esp), %eax

movl 8(%esp), %ecx

movl 12(%esp), %edx

movdqa LCPI0_0, %xmm1

andl $-8192, %eax            # imm = 0xFFFFFFFFFFFFE000

movl -16(%eax), %eax

shll $4, %eax

movdqa (%ecx,%eax), %xmm0

movdqa %xmm0, %xmm2

psrldq $4, %xmm0

pmuludq %xmm1, %xmm2

psrldq $4, %xmm1

pmuludq %xmm0, %xmm1

pshufd $-40, %xmm2, %xmm2      # xmm2 = xmm2[0,2,1,3]

pshufd $-40, %xmm1, %xmm0      # xmm0 = xmm1[0,2,1,3]

punpckldq %xmm0, %xmm2    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]

movdqa %xmm2, (%edx,%eax)

popl %eax

ret

 

The question is why the code in the first case does not use pmuludq instruction as mul24 ? It is hard to follow what is going on there (where is a multiplication or addition?) but I do not believe that this code could be faster and by the way mul24 should be fastest method. pmuludq gives 64 bit result that is a good one to cast into any format CL compiler wants.

 

Thanks.

Outcomes