/*
 * Copyright (c) 2007 Codesourcery
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef _SIMD_SIMD_MIPS_PS_H
#define _SIMD_SIMD_MIPS_PS_H

#ifndef FFTW_SINGLE
#error "MIPS PS only works in single precision"
#endif

#define VL 1            /* SIMD complex vector length */
#define ALIGNMENT 8     /* alignment for LD/ST */
#define ALIGNMENTA 8    /* alignment for LDA/STA */
#define SIMD_VSTRIDE_OKA(x) 1
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA

/* Codelets allow stores to be performed in one of two ways:

   - Values can be written back as they are generated.  This
     potentially lowers register pressure by freeing the register
     holding the value for reuse.

   - Values can be written back in groups to consecutive addresses.
     This potentially lowers the cost of address generation.  */
#define USE_MULTIPLE_STORES 0

#define RIGHT_CPU X(have_mips_ps)
extern int RIGHT_CPU(void);

#if defined(__mips__)

/* CodeSourcery header providing the paired-single type (psingle)
   and the _mips64_* intrinsics used below. */
#include <mips64.h>

typedef psingle V;

#define LDK(x) x
#define DVK(var, val) \
     const V var = _mips64_lc(val)

#define VADD _mips64_add
#define VSUB _mips64_sub
#define VMUL _mips64_mul
#define VFMS _mips64_msub
#define VFMA _mips64_madd
#define VFNMS _mips64_nmsub
#define UNPCKL _mips64_pl
#define UNPCKH _mips64_pu
#define STOREL _mips64_sl
#define STOREH _mips64_su
#define SHUFFLE _mips64_shuffle
#define FLIP_RI SHUFFLE
#define VCONJ _mips64_chs_i

#define VFMACONJ(b, c)  VADD(VCONJ(b), c)
#define VFMSCONJ(b, c)  VSUB(VCONJ(b), c)
#define VFNMSCONJ(b, c) VSUB(c, VCONJ(b))

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs;          /* UNUSED */
     return _mips64_ld((V *)x);
}
#define LD LDA

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs;          /* UNUSED */
     _mips64_st(x, v);
}
#define ST STA

#if USE_MULTIPLE_STORES
# define STM2(x, v, ovs, a) /* nop */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     (void)ovs; /* UNUSED */
     STA(x, v0, 0, 0);
     STA(x + 2, v1, 0, 0);
}
#else /* !USE_MULTIPLE_STORES */
# define STM2 STA
# define STN2(x, v0, v1, ovs) /* nop */
#endif

#if USE_MULTIPLE_STORES
# define STM4(x, v, ovs, a) /* nop */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     R *x_ptr = x;
     STA(x_ptr, UNPCKL(v0, v1), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKL(v2, v3), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKH(v0, v1), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKH(v2, v3), 0, 0);
}
#else /* !USE_MULTIPLE_STORES */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nop */
#endif

/* multiply by +i: (r, i) -> (-i, r) */
static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

#define VFMAI(b, c)  VADD(c, VBYI(b))  /* c + i*b */
#define VFNMSI(b, c) VSUB(c, VBYI(b))  /* c - i*b */

/* complex multiplication: tx * sr */
static inline V VZMUL(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

/* complex multiplication by the conjugate: conj(tx) * sr */
static inline V VZMULJ(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(sr, tr);
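/* A scalar model of the complex-arithmetic helpers above, assuming each
   V packs one complex number as (real, imag).  The type and function
   names below (cplx, scalar_vzmul, scalar_vzmulj) are illustrative only;
   the block is disabled and serves purely to document the intended
   arithmetic. */
#if 0
typedef struct { float r, i; } cplx;

static cplx scalar_vzmul(cplx t, cplx s)   /* cf. VZMUL: t * s */
{
     cplx y;
     y.r = t.r * s.r - t.i * s.i;
     y.i = t.r * s.i + t.i * s.r;
     return y;
}

static cplx scalar_vzmulj(cplx t, cplx s)  /* cf. VZMULJ: conj(t) * s */
{
     cplx y;
     y.r = t.r * s.r + t.i * s.i;
     y.i = t.r * s.i - t.i * s.r;
     return y;
}
#endif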
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

/* twiddle storage #1: compact, slower */
#define VTW1(v, x) {TW_CEXP, v, x}
#define TWVL1 1

static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v, x) \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#define TWVL2 2

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* i * (tx * sr) */
static inline V VZMULI(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

/* i * (conj(tx) * sr) */
static inline V VZMULIJ(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #3 */
#define VTW3(v, x) VTW1(v, x)
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#define VTWS(v, x) \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVLS (2 * VL)

#endif /* defined(__mips__) */

#endif /* _SIMD_SIMD_MIPS_PS_H */
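
/* Illustrative shape of a twiddle multiply inside a codelet; xp, tw,
   and ovs below are placeholder names, not defined in this header:

        V x = LD(xp, 1, xp);     load one complex element
        x = BYTW1(tw, x);        multiply by the twiddle factor at tw
        ST(xp, x, ovs, xp);      store the product

   BYTWJ1 applies the conjugated twiddle instead; the BYTW2/BYTWJ2 pair
   trades twice the twiddle storage (the VTW2 layout) for faster
   arithmetic while the larger table stays in cache. */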