/*
 * Copyright (c) 2003, 2007-8 Matteo Frigo
 * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
/* Generated by: ../../genfft/gen_twiddle_c -standalone -fma -reorder-insns -simd -compact -variables 100000 -include fftw-spu.h -trivial-stores -n 8 -name X(spu_t1fv_8) */

/*
 * This function contains 33 FP additions, 24 FP multiplications,
 * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
 * 44 stack variables, 1 constants, and 16 memory accesses
 */
#include "fftw-spu.h"

/*
 * Size-8 SIMD twiddle codelet (size and generator per the genfft command
 * line above), operating in place on the data addressed through ri.
 *
 * Each loop iteration:
 *   - loads eight vectors from x[0] and x[WS(rs, k)], k = 1..7;
 *   - passes inputs 1..7 through BYTWJ together with the twiddle entry at
 *     W[TWVL * 2 * (k - 1)] (input 0 is used as loaded);
 *   - combines the results with vector add/sub and fused multiply-add
 *     against the constant KP707106781 (~ sqrt(1/2));
 *   - stores eight result vectors back to the same eight locations.
 *
 * Parameters:
 *   ri      base of the data; read and overwritten.
 *   ii      not referenced by this codelet.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 14) before the
 *           first iteration and advanced by TWVL * 14 after each one.
 *   rs      stride handed to WS() to locate the eight points.
 *   mb, me  half-open iteration range [mb, me), stepped by VL.
 *   ms      handed to LD/ST; x also advances by VL * ms per iteration.
 *
 * NOTE(review): LD, ST, BYTWJ, VFMA, VFNMS, VFMAI and VFNMSI come from
 * fftw-spu.h and are not visible here.  BYTWJ is presumably a complex
 * multiply by a (conjugated) twiddle factor and the *I variants presumably
 * fold in a multiplication by +/- i -- confirm against the header.
 */
void X(spu_t1fv_8) (R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT m;
     R *x;

     x = ri;
     /* Skip the twiddle entries belonging to iterations before mb. */
     W = W + (mb * ((TWVL / VL) * 14));

     for (m = mb; m < me;
          m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(rs)) {
          /* Raw inputs, indexed by point number. */
          V in0, in1, in2, in3, in4, in5, in6, in7;
          /* Inputs 1..7 after BYTWJ. */
          V tw1, tw2, tw3, tw4, tw5, tw6, tw7;
          /* First-stage sums (s) and differences (d) of point pairs. */
          V s04, d04, s26, d26, s15, d15, s73, d73;
          /* Second-stage combinations feeding the odd-numbered outputs. */
          V rot_sum, rot_diff, a_plus, a_minus, b_plus, b_minus;
          /* Second-stage combinations feeding the even-numbered outputs. */
          V e_sum, e_diff, o_sum, o_diff;
          /* Results, indexed by the point they are stored to. */
          V out0, out1, out2, out3, out4, out5, out6, out7;

          /* Points 0 and 4. */
          in0 = LD(&(x[0]), ms, &(x[0]));
          in4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
          tw4 = BYTWJ(&(W[TWVL * 6]), in4);
          d04 = VSUB(in0, tw4);
          s04 = VADD(in0, tw4);

          /* Points 2 and 6. */
          in2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
          tw2 = BYTWJ(&(W[TWVL * 2]), in2);
          in6 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
          tw6 = BYTWJ(&(W[TWVL * 10]), in6);
          d26 = VSUB(tw2, tw6);
          s26 = VADD(tw2, tw6);

          /* Points 1, 3, 5 and 7. */
          in1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
          tw1 = BYTWJ(&(W[0]), in1);
          in3 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
          tw3 = BYTWJ(&(W[TWVL * 4]), in3);
          in5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
          tw5 = BYTWJ(&(W[TWVL * 8]), in5);
          in7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
          tw7 = BYTWJ(&(W[TWVL * 12]), in7);
          s15 = VADD(tw1, tw5);
          d15 = VSUB(tw1, tw5);
          d73 = VSUB(tw7, tw3);
          s73 = VADD(tw7, tw3);

          /* Outputs 1 and 7. */
          rot_sum = VADD(d15, d73);
          rot_diff = VSUB(d73, d15);
          a_plus = VFMA(LDK(KP707106781), rot_sum, d04);
          a_minus = VFNMS(LDK(KP707106781), rot_sum, d04);
          b_plus = VFMA(LDK(KP707106781), rot_diff, d26);
          b_minus = VFNMS(LDK(KP707106781), rot_diff, d26);
          out1 = VFNMSI(b_minus, a_plus);
          out7 = VFMAI(b_minus, a_plus);
          ST(&(x[WS(rs, 1)]), out1, ms, &(x[WS(rs, 1)]));
          ST(&(x[WS(rs, 7)]), out7, ms, &(x[WS(rs, 1)]));

          /* Outputs 6 and 2. */
          e_diff = VSUB(s04, s26);
          e_sum = VADD(s04, s26);
          o_sum = VADD(s15, s73);
          o_diff = VSUB(s73, s15);
          out6 = VFNMSI(o_diff, e_diff);
          out2 = VFMAI(o_diff, e_diff);
          ST(&(x[WS(rs, 6)]), out6, ms, &(x[0]));
          ST(&(x[WS(rs, 2)]), out2, ms, &(x[0]));

          /* Outputs 5 and 3. */
          out5 = VFNMSI(b_plus, a_minus);
          out3 = VFMAI(b_plus, a_minus);
          ST(&(x[WS(rs, 5)]), out5, ms, &(x[WS(rs, 1)]));
          ST(&(x[WS(rs, 3)]), out3, ms, &(x[WS(rs, 1)]));

          /* Outputs 4 and 0. */
          out4 = VSUB(e_sum, o_sum);
          out0 = VADD(e_sum, o_sum);
          ST(&(x[WS(rs, 4)]), out4, ms, &(x[0]));
          ST(&(x[0]), out0, ms, &(x[0]));
     }
}