/* C code for a complex FFT of size 8.
Each complex element contains two double-precision floats (real and imaginary),
so it can be packed into a single SIMD vector.
*/
/*
 * Scalar complex FFT of size 8 (non-SIMD reference twin of zfnyvmc8).
 *
 * x   : input array of interleaved complex values (real, imag pairs)
 * y   : output array, same layout
 * is  : stride (in CPLEX elements) between the 8 input points of one transform
 * os  : stride (in CPLEX elements) between the 8 output points of one transform
 * v   : number of independent size-8 transforms to perform
 * vis : input stride (in CPLEX elements) between consecutive transforms
 * vos : output stride (in CPLEX elements) between consecutive transforms
 */
void sfnyvmc8(register CPLEX *x, register CPLEX *y, register INT is, register INT os, register INT v, register INT vis, register INT vos)
{
/* sqrt(2)/2 at full double precision.  The previous value, 0.70710676908493,
   was only single-precision accurate, which silently degraded the accuracy of
   this double-precision codelet (see the file header) and disagreed with the
   constant used by the SIMD twin zfnyvmc8. */
const REAL C2 = 0.70710678118655; /* REALCONST */
REAL *xre, *xim, *yre, *yim;
INT i;
/* Real and imaginary parts are interleaved: imag pointer is real pointer + 1. */
xre=(REAL *)x; xim=xre+1; yre=(REAL *)y; yim=yre+1;
/* Convert all strides from CPLEX units to REAL units (each CPLEX is 2 REALs). */
is=(is<<1); os=(os<<1);
vis=(vis<<1); vos=(vos<<1);
for(i=0; i<v; i=i+1)
{
register REAL tmp4r, tmp5r, tmp6r, tmp7r, tmp12r, tmp13r, tmp14r, tmp15r,
tmp16r, tmp17r;
register REAL tmp4i, tmp5i, tmp6i, tmp7i, tmp12i, tmp13i, tmp14i, tmp15i,
tmp16i, tmp17i;
{
/* Radix-2 butterflies and size-4 combination of the even input points
   (x[0], x[2is], x[4is], x[6is]); tmp6/tmp7 absorb the multiply by -/+i. */
register REAL tmp0r, tmp1r, tmp2r, tmp3r;
register REAL tmp0i, tmp1i, tmp2i, tmp3i;
tmp0i = xim[0]+xim[4*is];
tmp0r = xre[0]+xre[4*is];
tmp1i = xim[0]-xim[4*is];
tmp1r = xre[0]-xre[4*is];
tmp2i = xim[2*is]+xim[6*is];
tmp2r = xre[2*is]+xre[6*is];
tmp3i = xim[2*is]-xim[6*is];
tmp3r = xre[2*is]-xre[6*is];
tmp4i = tmp0i+tmp2i;
tmp4r = tmp0r+tmp2r;
tmp5i = tmp0i-tmp2i;
tmp5r = tmp0r-tmp2r;
tmp6i = tmp1i-tmp3r;
tmp6r = tmp1r+tmp3i;
tmp7i = tmp1i+tmp3r;
tmp7r = tmp1r-tmp3i;
}
{
/* Same butterflies for the odd input points
   (x[is], x[3is], x[5is], x[7is]). */
register REAL tmp8r, tmp9r, tmp10r, tmp11r;
register REAL tmp8i, tmp9i, tmp10i, tmp11i;
tmp8i = xim[is]+xim[5*is];
tmp8r = xre[is]+xre[5*is];
tmp9i = xim[is]-xim[5*is];
tmp9r = xre[is]-xre[5*is];
tmp10i = xim[3*is]+xim[7*is];
tmp10r = xre[3*is]+xre[7*is];
tmp11i = xim[3*is]-xim[7*is];
tmp11r = xre[3*is]-xre[7*is];
tmp12i = tmp8i+tmp10i;
tmp12r = tmp8r+tmp10r;
tmp13i = tmp8i-tmp10i;
tmp13r = tmp8r-tmp10r;
tmp14i = tmp9i-tmp11r;
tmp14r = tmp9r+tmp11i;
tmp15i = tmp9i+tmp11r;
tmp15r = tmp9r-tmp11i;
}
/* Apply the +/-45-degree twiddle factors (w_8^1 and w_8^3). */
tmp16i = C2*(tmp14i-tmp14r);
tmp16r = C2*(tmp14r+tmp14i);
tmp17i = -C2*(tmp15i+tmp15r);
tmp17r = -C2*(tmp15r-tmp15i);
/* Final radix-2 combination; outputs land in natural (bit-reversed-free) order. */
yim[0] = tmp4i+tmp12i;
yre[0] = tmp4r+tmp12r;
yim[4*os] = tmp4i-tmp12i;
yre[4*os] = tmp4r-tmp12r;
yim[os] = tmp6i+tmp16i;
yre[os] = tmp6r+tmp16r;
yim[5*os] = tmp6i-tmp16i;
yre[5*os] = tmp6r-tmp16r;
yim[2*os] = tmp5i-tmp13r;
yre[2*os] = tmp5r+tmp13i;
yim[6*os] = tmp5i+tmp13r;
yre[6*os] = tmp5r-tmp13i;
yim[3*os] = tmp7i+tmp17i;
yre[3*os] = tmp7r+tmp17r;
yim[7*os] = tmp7i-tmp17i;
yre[7*os] = tmp7r-tmp17r;
/* Advance to the next vector; imag pointers trail the real ones by one REAL. */
xre=xre+vis;
yre=yre+vos;
xim=xre+1;
yim=yre+1;
}
}
/* SIMD code for a complex FFT of size 8.
VLD, VST, VSUB, and VADD map directly to intrinsics.
VCONJI is a composite of a shuffle and an xor.
VCMUL is a composite that packs two real constants into a SIMD vector and performs a complex multiplication:
VCMUL(cre,cim,b){
V are = _mm_load1_pd(cre);
V aim = _mm_load1_pd(cim);
are = VMUL(b, are);
b = VCONJ(b);
aim = VMUL(aim, b);
return VADD(are,aim);
}
Each array element is aligned to 128 bit boundaries
*/
/*
 * SIMD complex FFT of size 8 (vectorized twin of sfnyvmc8).
 * One SIMD vector V holds one complex value (real, imag); see the
 * primitive descriptions in the comment block above.
 *
 * x/y : input/output arrays, one complex value per element
 * is/os : strides between the 8 points of one transform
 * v : number of transforms; vis/vos : strides between transforms
 */
void zfnyvmc8(register CPLEX *x, register CPLEX *y, register INT is, register INT os, register INT v, register INT vis, register INT vos)
{
const REAL C0 = 0; /* ZEROCONST (unused here; emitted by the code generator) */
const REAL C2 = 0.70710678118655; /* REALCONST: sqrt(2)/2 */
const REAL C1 = 1; /* ONECONST (unused here; emitted by the code generator) */
INT i;
for(i=0; i<v; i=i+1)
{
register V a0, a1, a2, a3, a4, a5, a6, a7;  /* loaded input points */
register V s0, s1, s2, s3;                  /* radix-2 sums */
register V d0, d1, d2, d3;                  /* radix-2 differences */
register V w1, w2, w3;                      /* twiddled differences */
register V e0, e1, e2, e3;                  /* even output bins */
register V o0, o1, o2, o3;                  /* odd output bins */
a0 = VLD((REAL *)&x[0]);
a1 = VLD((REAL *)&x[is]);
a2 = VLD((REAL *)&x[2*is]);
a3 = VLD((REAL *)&x[3*is]);
a4 = VLD((REAL *)&x[4*is]);
a5 = VLD((REAL *)&x[5*is]);
a6 = VLD((REAL *)&x[6*is]);
a7 = VLD((REAL *)&x[7*is]);
/* First radix-2 stage: butterflies between points four apart. */
s0 = VADD(a0, a4);
d0 = VSUB(a0, a4);
s1 = VADD(a1, a5);
d1 = VSUB(a1, a5);
s2 = VADD(a2, a6);
d2 = VSUB(a2, a6);
s3 = VADD(a3, a7);
d3 = VSUB(a3, a7);
/* Twiddle the differences: w_8^1 and w_8^3 via VCMUL; VCONJI presumably
   effects the multiply by +/-i for w_8^2 -- TODO confirm against the
   macro definition. */
w1 = VCMUL(C2, -C2, d1);
w2 = VCONJI(d2);
w3 = VCMUL(-C2, -C2, d3);
{
/* Size-4 sub-FFT over the sums: produces the even-indexed outputs. */
register V p0, p1, q0, q1, q1j;
p0 = VADD(s0, s2);
p1 = VSUB(s0, s2);
q0 = VADD(s1, s3);
q1 = VSUB(s1, s3);
q1j = VCONJI(q1);
e0 = VADD(p0, q0);
e1 = VSUB(p0, q0);
e2 = VADD(p1, q1j);
e3 = VSUB(p1, q1j);
}
{
/* Size-4 sub-FFT over the twiddled differences: odd-indexed outputs. */
register V r0, r1, t0, t1, t1j;
r0 = VADD(d0, w2);
r1 = VSUB(d0, w2);
t0 = VADD(w1, w3);
t1 = VSUB(w1, w3);
t1j = VCONJI(t1);
o0 = VADD(r0, t0);
o1 = VSUB(r0, t0);
o2 = VADD(r1, t1j);
o3 = VSUB(r1, t1j);
}
/* Store in interleaved even/odd order, matching the scalar codelet. */
VST((REAL *)&y[0], e0);
VST((REAL *)&y[os], o0);
VST((REAL *)&y[2*os], e2);
VST((REAL *)&y[3*os], o2);
VST((REAL *)&y[4*os], e1);
VST((REAL *)&y[5*os], o1);
VST((REAL *)&y[6*os], e3);
VST((REAL *)&y[7*os], o3);
x=x+vis; y=y+vos;
}
}
The SIMD code is >30% slower than the scalar code in this case.
Hardware counter data:
Perf (nonsimd=80%, simd=50%)
So the question is: why is the SIMD code slower?
FP_OPS (nonsimd=78, simd=120)
Why does it execute more FP_OPS than the scalar (non-SIMD) code?
FP_INS (nonsimd=122, simd=102) VEC_INS (nonsimd=0, simd=94)
The SIMD code has a lower number of FP_INS; that makes sense.
FAD_INS (nonsimd=63, simd=74)
The SIMD code has 26 vector adds, i.e., 52 FP_ADD instructions.
I can send you the assembly if you like.