Hi,

Let us consider two multiplicatoin kernels.

1. Simple multiplication using the *operator:

__kernel void multiplication(__global uint4 *src, __global uint4 *dst)

{

size_t i = get_global_id(0);

dst[i] = src[i] * 3;

};

X86 Assembler:

pushl %eax

leal 8191(%esp), %eax

movl 8(%esp), %ecx

movl 12(%esp), %edx

andl $-8192, %eax # imm = 0xFFFFFFFFFFFFE000

movl -16(%eax), %eax

shll $4, %eax

movdqa (%ecx,%eax), %xmm0

pshufd $3, %xmm0, %xmm1 # xmm1 = xmm0[3,0,0,0]

pshufd $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0,0,0]

movd %xmm1, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm1

movd %xmm2, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm2

movd %xmm0, %ecx

movhlps %xmm0, %xmm0 # xmm0 = xmm0[1,1]

leal (%ecx,%ecx,2), %ecx

punpckldq %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]

movd %ecx, %xmm1

movd %xmm0, %ecx

leal (%ecx,%ecx,2), %ecx

movd %ecx, %xmm0

punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]

punpckldq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]

movdqa %xmm1, (%edx,%eax)

popl %eax

ret

2. Simple multiplication using mul24 (or mul_hi):

__kernel void multiplication(__global uint4 *src, __global uint4 *dst)

{

size_t i = get_global_id(0);

dst[i] = mul24(src[i], 3);

};

X86 Assembler:

pushl %eax

leal 8191(%esp), %eax

movl 8(%esp), %ecx

movl 12(%esp), %edx

movdqa LCPI0_0, %xmm1

andl $-8192, %eax # imm = 0xFFFFFFFFFFFFE000

movl -16(%eax), %eax

shll $4, %eax

movdqa (%ecx,%eax), %xmm0

movdqa %xmm0, %xmm2

psrldq $4, %xmm0

pmuludq %xmm1, %xmm2

psrldq $4, %xmm1

pmuludq %xmm0, %xmm1

pshufd $-40, %xmm2, %xmm2 # xmm2 = xmm2[0,2,1,3]

pshufd $-40, %xmm1, %xmm0 # xmm0 = xmm1[0,2,1,3]

punpckldq %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]

movdqa %xmm2, (%edx,%eax)

popl %eax

ret

The question is why the code in the first case does not use pmuludq instruction as mul24 ? It is hard to follow what is going on there (where is a multiplication or addition?) but I do not believe that this code could be faster and by the way mul24 should be fastest method. pmuludq gives 64 bit result that is a good one to cast into any format CL compiler wants.

Thanks.

Is there any example of multiplication using assembly language in Randall Hyde's "The Art of Assembly Language"?