From fce4dad356b0d49d76c5ecfd8deda207ed53f0d8 Mon Sep 17 00:00:00 2001 From: Evidence John Date: Tue, 1 Jun 2021 15:43:45 +0800 Subject: [PATCH] Support MSA SIMD for MIPS --enable-msa now works in single and double precision for MIPS. Tested on both 32-bit and 64-bit MIPS run in little-endian. --- Makefile.am | 7 +- api/version.c | 4 + cmake.config.h.in | 3 + configure.ac | 20 ++- dft/codelet-dft.h | 1 + dft/conf.c | 4 + dft/simd/Makefile.am | 2 +- dft/simd/msa/Makefile.am | 13 ++ doc/install.texi | 1 + doc/intro.texi | 3 +- doc/other.texi | 5 +- kernel/ifftw.h | 3 +- rdft/codelet-rdft.h | 1 + rdft/conf.c | 4 + rdft/simd/Makefile.am | 2 +- rdft/simd/msa/Makefile.am | 13 ++ simd-support/Makefile.am | 1 + simd-support/msa.c | 73 ++++++++ simd-support/simd-common.h | 3 + simd-support/simd-msa.h | 348 +++++++++++++++++++++++++++++++++++++ 20 files changed, 501 insertions(+), 10 deletions(-) create mode 100644 dft/simd/msa/Makefile.am create mode 100644 rdft/simd/msa/Makefile.am create mode 100644 simd-support/msa.c create mode 100644 simd-support/simd-msa.h diff --git a/Makefile.am b/Makefile.am index eaf131cca..3d4170bdd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -94,6 +94,11 @@ NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la \ rdft/simd/neon/librdft_neon_codelets.la endif +if HAVE_MSA +MSA_LIBS = dft/simd/msa/libdft_msa_codelets.la \ +rdft/simd/msa/librdft_msa_codelets.la +endif + if HAVE_GENERIC_SIMD128 GENERIC_SIMD128_LIBS = dft/simd/generic-simd128/libdft_generic_simd128_codelets.la \ rdft/simd/generic-simd128/librdft_generic_simd128_codelets.la @@ -125,7 +130,7 @@ libfftw3@PREC_SUFFIX@_la_LIBADD = \ reodft/libreodft.la \ api/libapi.la \ $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(AVX_128_FMA_LIBS) \ - $(AVX2_LIBS) $(ALTIVEC_LIBS) \ + $(AVX2_LIBS) $(ALTIVEC_LIBS) $(MSA_LIBS) \ $(VSX_LIBS) $(NEON_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \ $(GENERIC_SIMD128_LIBS) $(GENERIC_SIMD256_LIBS) \ $(COMBINED_THREADLIBS) diff --git a/api/version.c b/api/version.c index 
4f14de157..269488dcd 100644 --- a/api/version.c +++ b/api/version.c @@ -77,6 +77,10 @@ const char X(version)[] = PACKAGE "-" PACKAGE_VERSION "-neon" #endif +#if HAVE_MSA + "-msa" +#endif + #if defined(HAVE_GENERIC_SIMD128) "-generic_simd128" #endif diff --git a/cmake.config.h.in b/cmake.config.h.in index 1f4c50559..a3f0a5b24 100644 --- a/cmake.config.h.in +++ b/cmake.config.h.in @@ -202,6 +202,9 @@ /* Define to enable ARM NEON optimizations. */ /* #undef HAVE_NEON */ +/* Define to enable MIPS MSA optimizations. */ +/* #undef HAVE_MSA */ + /* Define if OpenMP is enabled */ #cmakedefine HAVE_OPENMP diff --git a/configure.ac b/configure.ac index b89ba03d6..3ac9979af 100644 --- a/configure.ac +++ b/configure.ac @@ -234,6 +234,11 @@ if test "$have_generic_simd256" = "yes"; then fi AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes") +AC_ARG_ENABLE(msa, [AC_HELP_STRING([--enable-msa],[enable MIPS MSA optimizations])], have_msa=$enableval, have_msa=no) +if test "$have_msa" = "yes"; then + AC_DEFINE(HAVE_MSA,1,[Define to enable MIPS MSA optimizations.]) +fi +AM_CONDITIONAL(HAVE_MSA, test "$have_msa" = "yes") dnl FIXME: dnl AC_ARG_ENABLE(mips-ps, [AC_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no) @@ -359,9 +364,12 @@ case "${ax_cv_c_compiler_vendor}" in fi # AVX2 + # gcc-4.8 works with -march=core-avx2, but -mavx2 is not enough. + # Later versions seem to happy with -mavx2, so try the arch one first. 
if test "$have_avx2" = "yes" -a "x$AVX2_CFLAGS" = x; then - AX_CHECK_COMPILER_FLAGS(-mavx2, [AVX2_CFLAGS="-mavx2"], - [AC_MSG_ERROR([Need a version of gcc with -mavx2])]) + AX_CHECK_COMPILER_FLAGS(-march=core-avx2, [AVX2_CFLAGS="-march=core-avx2"], + [AX_CHECK_COMPILER_FLAGS(-mavx2, [AVX2_CFLAGS="-mavx2"], + [AC_MSG_ERROR([Need a version of gcc with either -march=core-avx2 or -mavx2])])]) AX_CHECK_COMPILER_FLAGS(-mfma, [AVX2_CFLAGS="$AVX2_CFLAGS -mfma"], [AC_MSG_WARN([Need a version of gcc with -mfma (harmless for icc)])]) fi @@ -411,6 +419,11 @@ case "${ax_cv_c_compiler_vendor}" in [AC_MSG_ERROR([Need a version of gcc with -mvsx])]) fi + if test "$have_msa" = "yes" -a "x$MSA_CFLAGS" = x; then + AX_CHECK_COMPILER_FLAGS(-mmsa, [MSA_CFLAGS="-mmsa"], + [AC_MSG_ERROR([Need a version of gcc with -mmsa])]) + fi + dnl FIXME: dnl elif test "$have_mips_ps" = "yes"; then dnl # Just punt here and use only new 4.2 compiler :( @@ -471,6 +484,7 @@ AC_SUBST(KCVI_CFLAGS) AC_SUBST(ALTIVEC_CFLAGS) AC_SUBST(VSX_CFLAGS) AC_SUBST(NEON_CFLAGS) +AC_SUBST(MSA_CFLAGS) dnl add stack alignment CFLAGS if so requested if test "$with_incoming_stack_boundary"x != "no"x; then @@ -766,6 +780,7 @@ AC_CONFIG_FILES([ dft/simd/altivec/Makefile dft/simd/vsx/Makefile dft/simd/neon/Makefile + dft/simd/msa/Makefile dft/simd/generic-simd128/Makefile dft/simd/generic-simd256/Makefile @@ -786,6 +801,7 @@ AC_CONFIG_FILES([ rdft/simd/altivec/Makefile rdft/simd/vsx/Makefile rdft/simd/neon/Makefile + rdft/simd/msa/Makefile rdft/simd/generic-simd128/Makefile rdft/simd/generic-simd256/Makefile diff --git a/dft/codelet-dft.h b/dft/codelet-dft.h index b78e135c8..e419a6c38 100644 --- a/dft/codelet-dft.h +++ b/dft/codelet-dft.h @@ -106,6 +106,7 @@ extern const solvtab X(solvtab_dft_kcvi); extern const solvtab X(solvtab_dft_altivec); extern const solvtab X(solvtab_dft_vsx); extern const solvtab X(solvtab_dft_neon); +extern const solvtab X(solvtab_dft_msa); extern const solvtab X(solvtab_dft_generic_simd128); extern 
const solvtab X(solvtab_dft_generic_simd256); diff --git a/dft/conf.c b/dft/conf.c index d0951de5d..b264c1db9 100644 --- a/dft/conf.c +++ b/dft/conf.c @@ -79,6 +79,10 @@ void X(dft_conf_standard)(planner *p) if (X(have_simd_neon)()) X(solvtab_exec)(X(solvtab_dft_neon), p); #endif +#if HAVE_MSA + if (X(have_simd_msa)()) + X(solvtab_exec)(X(solvtab_dft_msa), p); +#endif #if HAVE_GENERIC_SIMD128 X(solvtab_exec)(X(solvtab_dft_generic_simd128), p); #endif diff --git a/dft/simd/Makefile.am b/dft/simd/Makefile.am index 315d74474..fcb8e40e8 100644 --- a/dft/simd/Makefile.am +++ b/dft/simd/Makefile.am @@ -1,4 +1,4 @@ AM_CPPFLAGS = -I $(top_srcdir) -SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256 +SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon msa generic-simd128 generic-simd256 EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h \ t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk diff --git a/dft/simd/msa/Makefile.am b/dft/simd/msa/Makefile.am new file mode 100644 index 000000000..3bf566935 --- /dev/null +++ b/dft/simd/msa/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(MSA_CFLAGS) +SIMD_HEADER=simd-support/simd-msa.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_MSA + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_msa_codelets.la +libdft_msa_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/doc/install.texi b/doc/install.texi index 6ccac20b2..00911a303 100644 --- a/doc/install.texi +++ b/doc/install.texi @@ -199,6 +199,7 @@ of the time). @xref{Cycle Counters}. 
@code{--enable-altivec} (single), @code{--enable-vsx} (single, double), @code{--enable-neon} (single, double on aarch64), +@code{--enable-msa} (single, double on mips), @code{--enable-generic-simd128}, and @code{--enable-generic-simd256}: diff --git a/doc/intro.texi b/doc/intro.texi index b20d867f7..28ee6b0a1 100644 --- a/doc/intro.texi +++ b/doc/intro.texi @@ -18,8 +18,7 @@ transform (DFT) and various special cases thereof. @item FFTW supports arbitrary multi-dimensional data. -@item FFTW supports the SSE, SSE2, AVX, AVX2, AVX512, KCVI, Altivec, VSX, and - NEON vector instruction sets. +@item FFTW supports the SSE, SSE2, AVX, AVX2, AVX512, KCVI, Altivec, VSX, NEON and MSA vector instruction sets. @item FFTW includes parallel (multi-threaded) transforms for shared-memory systems. diff --git a/doc/other.texi b/doc/other.texi index b2d75ce9c..f21efacce 100644 --- a/doc/other.texi +++ b/doc/other.texi @@ -16,8 +16,9 @@ special operations supported by some processors to perform a single operation on several numbers (usually 2 or 4) simultaneously. SIMD floating-point instructions are available on several popular CPUs: SSE/SSE2/AVX/AVX2/AVX512/KCVI on some x86/x86-64 processors, AltiVec and -VSX on some POWER/PowerPCs, NEON on some ARM models. FFTW can be -compiled to support the SIMD instructions on any of these systems. +VSX on some POWER/PowerPCs, NEON on some ARM models, MSA on some MIPS +models. FFTW can be compiled to support the SIMD instructions on any of +these systems. 
@cindex SIMD @cindex SSE @cindex SSE2 diff --git a/kernel/ifftw.h b/kernel/ifftw.h index 0733e7566..4c052be13 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -103,7 +103,7 @@ extern void X(extract_reim)(int sign, R *c, R **r, R **i); defined(HAVE_AVX2) || defined(HAVE_AVX512) || \ defined(HAVE_KCVI) || \ defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \ - defined(HAVE_MIPS_PS) || \ + defined(HAVE_MIPS_PS) || defined(HAVE_MSA) || \ defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256) #define HAVE_SIMD 1 #else @@ -119,6 +119,7 @@ extern int X(have_simd_avx512)(void); extern int X(have_simd_altivec)(void); extern int X(have_simd_vsx)(void); extern int X(have_simd_neon)(void); +extern int X(have_simd_msa)(void); /* forward declarations */ typedef struct problem_s problem; diff --git a/rdft/codelet-rdft.h b/rdft/codelet-rdft.h index 789040f65..0b5b8d61f 100644 --- a/rdft/codelet-rdft.h +++ b/rdft/codelet-rdft.h @@ -145,6 +145,7 @@ extern const solvtab X(solvtab_rdft_kcvi); extern const solvtab X(solvtab_rdft_altivec); extern const solvtab X(solvtab_rdft_vsx); extern const solvtab X(solvtab_rdft_neon); +extern const solvtab X(solvtab_rdft_msa); extern const solvtab X(solvtab_rdft_generic_simd128); extern const solvtab X(solvtab_rdft_generic_simd256); diff --git a/rdft/conf.c b/rdft/conf.c index 5fe8d665f..bb656b659 100644 --- a/rdft/conf.c +++ b/rdft/conf.c @@ -96,6 +96,10 @@ void X(rdft_conf_standard)(planner *p) if (X(have_simd_neon)()) X(solvtab_exec)(X(solvtab_rdft_neon), p); #endif +#if HAVE_MSA + if (X(have_simd_msa)()) + X(solvtab_exec)(X(solvtab_rdft_msa), p); +#endif #if HAVE_GENERIC_SIMD128 X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p); #endif diff --git a/rdft/simd/Makefile.am b/rdft/simd/Makefile.am index 53de164f0..44c6e4883 100644 --- a/rdft/simd/Makefile.am +++ b/rdft/simd/Makefile.am @@ -1,4 +1,4 @@ AM_CPPFLAGS = -I $(top_srcdir) -SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 
generic-simd256 +SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon msa generic-simd128 generic-simd256 EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk diff --git a/rdft/simd/msa/Makefile.am b/rdft/simd/msa/Makefile.am new file mode 100644 index 000000000..d23d60212 --- /dev/null +++ b/rdft/simd/msa/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(MSA_CFLAGS) +SIMD_HEADER=simd-support/simd-msa.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_MSA + +noinst_LTLIBRARIES = librdft_msa_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_msa_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am index 26db46e93..fd20bb46f 100644 --- a/simd-support/Makefile.am +++ b/simd-support/Makefile.am @@ -11,5 +11,6 @@ avx512.c simd-avx512.h \ kcvi.c simd-kcvi.h \ altivec.c simd-altivec.h vsx.c simd-vsx.h \ neon.c simd-neon.h \ +msa.c simd-msa.h \ simd-generic128.h simd-generic256.h diff --git a/simd-support/msa.c b/simd-support/msa.c new file mode 100644 index 000000000..630d17456 --- /dev/null +++ b/simd-support/msa.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2003, 2007-14 Matteo Frigo + * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "kernel/ifftw.h" + +#if HAVE_MSA + +/* check for an environment where signals are known to work */ +#if defined(unix) || defined(linux) + # include <signal.h> + # include <setjmp.h> + + static jmp_buf jb; + + static void sighandler(int x) + { + UNUSED(x); + longjmp(jb, 1); + } + + static int msa_works(void) + { + void (*oldsig)(int); + oldsig = signal(SIGILL, sighandler); + if (setjmp(jb)) { + signal(SIGILL, oldsig); + return 0; + } else { + /* asm volatile ("xor.v $w0, $w0, $w0"); */ + asm volatile (".long 0x7860001e"); + signal(SIGILL, oldsig); + return 1; + } + } + + int X(have_simd_msa)(void) + { + static int init = 0, res; + + if (!init) { + res = msa_works(); + init = 1; + } + return res; + } + +#else +/* don't know how to autodetect MSA; assume it is present */ + int X(have_simd_msa)(void) + { + return 1; + } +#endif + +#endif diff --git a/simd-support/simd-common.h b/simd-support/simd-common.h index ad2c96fa1..46caa118d 100644 --- a/simd-support/simd-common.h +++ b/simd-support/simd-common.h @@ -44,6 +44,9 @@ # define ALIGNMENT 16 /* Alignment for the LD/ST macros */ # endif # define ALIGNMENTA 64 /* Alignment for the LDA/STA macros */ +#elif defined(HAVE_MSA) +# define ALIGNMENT 16 /* Alignment for the LD/ST macros */ +# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */ #elif defined(HAVE_GENERIC_SIMD256) # if defined(FFTW_SINGLE) # define ALIGNMENT 8 diff --git a/simd-support/simd-msa.h b/simd-support/simd-msa.h new file mode 100644 index 000000000..3d3a24b79 --- /dev/null +++ b/simd-support/simd-msa.h @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2003, 2007-14 Matteo Frigo + * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the
terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD) +# error "MSA only works in single/double precision" +#endif + +#define SIMD_SUFFIX _msa + +#ifdef FFTW_SINGLE +# define DS(d,s) s /* single-precision option */ +# define SUFF(name) name ## _w +#else +# define DS(d,s) d /* double-precision option */ +# define SUFF(name) name ## _d +#endif + +#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */ +#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2)) +#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK + +#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__mips_msa) +# error "compiling simd-msa.h in double precision without -mmsa" +#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__mips_msa) +# error "compiling simd-msa.h in single precision without -mmsa" +#endif + +#include <msa.h> + +typedef DS(v2f64, v4f32) V; +#define VADD SUFF(__builtin_msa_fadd) +#define VSUB SUFF(__builtin_msa_fsub) +#define VMUL SUFF(__builtin_msa_fmul) + +#define LDK(x) x + +static inline V VDUPL(V x) +{ +#ifdef FFTW_SINGLE + /* __builtin_shuffle(x, (v4i32){0, 0, 2, 2}); */ + return (V)__builtin_msa_shf_w((v4i32)x, 0xa0); +#else + /* __builtin_shuffle(x, (v4i32){0, 1, 0, 1}) */ + return (V)__builtin_msa_shf_w((v4i32)x, 0x44); +#endif +} + +static inline V VDUPH(V x) +{ +#ifdef FFTW_SINGLE + /* __builtin_shuffle(x, (v4i32){1, 1,
3, 3}); */ + return (V)__builtin_msa_shf_w((v4i32)x, 0xf5); +#else + /* __builtin_shuffle(x, (v4i32){2, 3, 2, 3}) */ + return (V)__builtin_msa_shf_w((v4i32)x, 0xee); +#endif +} + +#ifdef FFTW_SINGLE +static inline V MSA_FILL(float val) /* broadcast val to all four lanes */ +{ + return (V){val, val, val, val}; /* vector literal: no float-through-int pointer pun (strict aliasing) */ +} +#else +static inline V MSA_FILL(double val) /* broadcast val to both lanes */ +{ + return (V){val, val}; /* vector literal: no double-through-integer pointer pun (strict aliasing) */ +} +#endif + +#define DVK(var, val) V var = MSA_FILL(val) + +static inline V LDA(const R *x, INT ivs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + (void)ivs; /* UNUSED */ + return *(const V *)x; +} + +static inline void STA(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + (void)ovs; /* UNUSED */ + *(V *)x = v; +} + +#ifdef FFTW_SINGLE + +static inline V LD(const R *x, INT ivs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + V resl = (V)__builtin_msa_ld_w(x, 0); + V resh = (V)__builtin_msa_ld_w(x + ivs, 0); + return (V)__builtin_msa_ilvr_d((v2i64)resh, (v2i64)resl); +} + +static inline void ST(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + *(x + ovs ) = v[2]; + *(x + ovs + 1) = v[3]; + *(x ) = v[0]; + *(x + 1) = v[1]; +} + +#else /* !
FFTW_SINGLE */ +# define LD LDA +# define ST STA +#endif + +#ifdef FFTW_SINGLE +#define STM2 ST +#define STN2(x, v0, v1, ovs) /* nop */ + +#define UNPCKL(a, b) ((V)__builtin_msa_ilvr_w((v4i32)(b), (v4i32)(a))) /* (a0,b0,a1,b1): ilvr takes its even lanes from the 2nd operand */ +#define UNPCKH(a, b) ((V)__builtin_msa_ilvl_w((v4i32)(b), (v4i32)(a))) /* (a2,b2,a3,b3): same operand convention as ilvr */ + +# define STN4(x, v0, v1, v2, v3, ovs) \ +{ \ + V xxx0, xxx1, xxx2, xxx3; \ + xxx0 = UNPCKL(v0, v2); \ + xxx1 = UNPCKH(v0, v2); \ + xxx2 = UNPCKL(v1, v3); \ + xxx3 = UNPCKH(v1, v3); \ + STA(x, UNPCKL(xxx0, xxx2), 0, 0); \ + STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \ + STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \ + STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \ +} + +#define STM4(x, v, ovs, aligned_like) /* no-op */ + +#else +/* FFTW_DOUBLE */ + +#define STM2 STA +#define STN2(x, v0, v1, ovs) /* nop */ + +static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + *(x) = v[0]; + *(x+ovs) = v[1]; +} +# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */ +#endif + +static inline V FLIP_RI(V x) +{ +#ifdef FFTW_SINGLE + /* __builtin_shuffle(x, (v4i32){1, 0, 3, 2}); */ + return (V)__builtin_msa_shf_w((v4i32)x, 0xb1); +#else + /* __builtin_shuffle(x, (v4i32){2, 3, 0, 1}) */ + return (V)__builtin_msa_shf_w((v4i32)x, 0x4e); +#endif +} + +static inline V VCONJ(V x) +{ +#ifdef FFTW_SINGLE + static const v16u8 pm = {0,0,0,0,0,0,0,0x80,0,0,0,0,0,0,0,0x80}; + return (V)__builtin_msa_xor_v((v16u8)x, pm); +#else + static const v16u8 pm = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x80}; + return (V)__builtin_msa_xor_v((v16u8)x, pm); +#endif +} + +static inline V VBYI(V x) +{ + return FLIP_RI(VCONJ(x)); +} + +#ifdef FFTW_SINGLE +# define VLIT(x0, x1) {x0, x1, x0, x1} +#else +# define VLIT(x0, x1) {x0, x1} +#endif + +/* FMA support: __builtin_msa_fmadd(x, y, z) is x + y*z and __builtin_msa_fmsub(x, y, z) is x - y*z, so the addend c is passed first */ +#ifdef MIPS_MSA_FMA +# define VFMA(a, b, c) SUFF(__builtin_msa_fmadd)(c, a, b) +# define VFNMS(a, b, c) SUFF(__builtin_msa_fmsub)(c, a, b) +# define VFMS(a, b, c) (-SUFF(__builtin_msa_fmsub)(c, a, b)) +#else +# define VFMA(a, b, c) VADD(c, VMUL(a, b))
+# define VFNMS(a, b, c) VSUB(c, VMUL(a, b)) +# define VFMS(a, b, c) VSUB(VMUL(a, b), c) +#endif + +static inline V VFMAI(V b, V c) +{ + static const V mp = VLIT(-1.0, 1.0); + return VFMA(FLIP_RI(b), mp, c); +} + +static inline V VFNMSI(V b, V c) +{ + static const V mp = VLIT(-1.0, 1.0); + return VFNMS(FLIP_RI(b), mp, c); +} + +static inline V VFMACONJ(V b, V c) +{ + static const V pm = VLIT(1.0, -1.0); + return VFMA(b, pm, c); +} + +static inline V VFNMSCONJ(V b, V c) +{ + static const V pm = VLIT(1.0, -1.0); + return VFNMS(b, pm, c); +} + +static inline V VFMSCONJ(V b, V c) +{ + return VSUB(VCONJ(b), c); +} + +static inline V VZMUL(V tx, V sr) +{ + V tr = VDUPL(tx); + V ti = VDUPH(tx); + tr = VMUL(sr, tr); + sr = VBYI(sr); + return VFMA(ti, sr, tr); +} + +static inline V VZMULJ(V tx, V sr) +{ + V tr = VDUPL(tx); + V ti = VDUPH(tx); + tr = VMUL(sr, tr); + sr = VBYI(sr); + return VFNMS(ti, sr, tr); +} + +static inline V VZMULI(V tx, V sr) +{ + V tr = VDUPL(tx); + V ti = VDUPH(tx); + ti = VMUL(ti, sr); + sr = VBYI(sr); + return VFMS(tr, sr, ti); +} + +static inline V VZMULIJ(V tx, V sr) +{ + V tr = VDUPL(tx); + V ti = VDUPH(tx); + ti = VMUL(ti, sr); + sr = VBYI(sr); + return VFMA(tr, sr, ti); +} + +/* twiddle storage #1: compact, slower */ +#ifdef FFTW_SINGLE +# define VTW1(v,x) \ + {TW_CEXP, v, x}, {TW_CEXP, v+1, x} +static inline V BYTW1(const R *t, V sr) +{ + return VZMUL(LDA(t, 2, t), sr); +} +static inline V BYTWJ1(const R *t, V sr) +{ + return VZMULJ(LDA(t, 2, t), sr); +} +#else /* !FFTW_SINGLE */ +# define VTW1(v,x) {TW_CEXP, v, x} +static inline V BYTW1(const R *t, V sr) +{ + V tx = LD(t, 1, t); + return VZMUL(tx, sr); +} +static inline V BYTWJ1(const R *t, V sr) +{ + V tx = LD(t, 1, t); + return VZMULJ(tx, sr); +} +#endif +#define TWVL1 (VL) + +/* twiddle storage #2: twice the space, faster (when in cache) */ +#ifdef FFTW_SINGLE +# define VTW2(v,x) \ + {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_SIN, v, -x}, {TW_SIN, v, x}, 
{TW_SIN, v+1, -x}, {TW_SIN, v+1, x} +#else /* !FFTW_SINGLE */ +# define VTW2(v,x) \ + {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x} +#endif +#define TWVL2 (2 * VL) +static inline V BYTW2(const R *t, V sr) +{ + const V *twp = (const V *)t; + V si = FLIP_RI(sr); + V tr = twp[0], ti = twp[1]; + return VFMA(tr, sr, VMUL(ti, si)); +} +static inline V BYTWJ2(const R *t, V sr) +{ + const V *twp = (const V *)t; + V si = FLIP_RI(sr); + V tr = twp[0], ti = twp[1]; + return VFNMS(ti, si, VMUL(tr, sr)); +} + +/* twiddle storage #3 */ +#ifdef FFTW_SINGLE +# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x} +# define TWVL3 (VL) +#else +# define VTW3(v,x) VTW1(v,x) +# define TWVL3 TWVL1 +#endif + +/* twiddle storage for split arrays */ +#ifdef FFTW_SINGLE +# define VTWS(v,x) \ + {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x} +#else +# define VTWS(v,x) \ + {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x} +#endif +#define TWVLS (2 * VL) + +#define VLEAVE() /* nothing */ + +#include "simd-common.h"