From fce4dad356b0d49d76c5ecfd8deda207ed53f0d8 Mon Sep 17 00:00:00 2001
From: Evidence John <mail@evi.fun>
Date: Tue, 1 Jun 2021 15:43:45 +0800
Subject: [PATCH] Support MSA SIMD for MIPS

--enable-msa now works in single and double precision for MIPS.
Tested on both 32-bit and 64-bit MIPS running in little-endian mode.
---
 Makefile.am                |   7 +-
 api/version.c              |   4 +
 cmake.config.h.in          |   3 +
 configure.ac               |  20 ++-
 dft/codelet-dft.h          |   1 +
 dft/conf.c                 |   4 +
 dft/simd/Makefile.am       |   2 +-
 dft/simd/msa/Makefile.am   |  13 ++
 doc/install.texi           |   1 +
 doc/intro.texi             |   3 +-
 doc/other.texi             |   5 +-
 kernel/ifftw.h             |   3 +-
 rdft/codelet-rdft.h        |   1 +
 rdft/conf.c                |   4 +
 rdft/simd/Makefile.am      |   2 +-
 rdft/simd/msa/Makefile.am  |  13 ++
 simd-support/Makefile.am   |   1 +
 simd-support/msa.c         |  73 ++++++++
 simd-support/simd-common.h |   3 +
 simd-support/simd-msa.h    | 348 +++++++++++++++++++++++++++++++++++++
 20 files changed, 501 insertions(+), 10 deletions(-)
 create mode 100644 dft/simd/msa/Makefile.am
 create mode 100644 rdft/simd/msa/Makefile.am
 create mode 100644 simd-support/msa.c
 create mode 100644 simd-support/simd-msa.h

diff --git a/Makefile.am b/Makefile.am
index eaf131cca..3d4170bdd 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -94,6 +94,11 @@ NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la	\
 rdft/simd/neon/librdft_neon_codelets.la
 endif
 
+if HAVE_MSA
+MSA_LIBS = dft/simd/msa/libdft_msa_codelets.la	\
+rdft/simd/msa/librdft_msa_codelets.la
+endif
+
 if HAVE_GENERIC_SIMD128
 GENERIC_SIMD128_LIBS = dft/simd/generic-simd128/libdft_generic_simd128_codelets.la \
 rdft/simd/generic-simd128/librdft_generic_simd128_codelets.la
@@ -125,7 +130,7 @@ libfftw3@PREC_SUFFIX@_la_LIBADD =			\
 	reodft/libreodft.la				\
 	api/libapi.la					\
         $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(AVX_128_FMA_LIBS) \
-        $(AVX2_LIBS) $(ALTIVEC_LIBS) \
+        $(AVX2_LIBS) $(ALTIVEC_LIBS) $(MSA_LIBS) \
         $(VSX_LIBS) $(NEON_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \
         $(GENERIC_SIMD128_LIBS) $(GENERIC_SIMD256_LIBS) \
 	$(COMBINED_THREADLIBS)
diff --git a/api/version.c b/api/version.c
index 4f14de157..269488dcd 100644
--- a/api/version.c
+++ b/api/version.c
@@ -77,6 +77,10 @@ const char X(version)[] = PACKAGE "-" PACKAGE_VERSION
    "-neon"
 #endif
 
+#if HAVE_MSA
+   "-msa"
+#endif
+
 #if defined(HAVE_GENERIC_SIMD128)
    "-generic_simd128"
 #endif
diff --git a/cmake.config.h.in b/cmake.config.h.in
index 1f4c50559..a3f0a5b24 100644
--- a/cmake.config.h.in
+++ b/cmake.config.h.in
@@ -202,6 +202,9 @@
 /* Define to enable ARM NEON optimizations. */
 /* #undef HAVE_NEON */
 
+/* Define to enable MIPS MSA optimizations. */
+/* #undef HAVE_MSA */
+
 /* Define if OpenMP is enabled */
 #cmakedefine HAVE_OPENMP
 
diff --git a/configure.ac b/configure.ac
index b89ba03d6..3ac9979af 100644
--- a/configure.ac
+++ b/configure.ac
@@ -234,6 +234,11 @@ if test "$have_generic_simd256" = "yes"; then
 fi
 AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes")
 
+AC_ARG_ENABLE(msa, [AC_HELP_STRING([--enable-msa],[enable MIPS MSA optimizations])], have_msa=$enableval, have_msa=no)
+if test "$have_msa" = "yes"; then
+        AC_DEFINE(HAVE_MSA,1,[Define to enable MIPS MSA optimizations.])
+fi
+AM_CONDITIONAL(HAVE_MSA, test "$have_msa" = "yes")
 
 dnl FIXME:
 dnl AC_ARG_ENABLE(mips-ps, [AC_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no)
@@ -359,9 +364,12 @@ case "${ax_cv_c_compiler_vendor}" in
         fi
 
         # AVX2
+        # gcc-4.8 works with -march=core-avx2, but -mavx2 is not enough.
+        # Later versions seem to be happy with -mavx2, so try the arch one first.
         if test "$have_avx2" = "yes" -a "x$AVX2_CFLAGS" = x; then
-            AX_CHECK_COMPILER_FLAGS(-mavx2, [AVX2_CFLAGS="-mavx2"],
-                [AC_MSG_ERROR([Need a version of gcc with -mavx2])])
+            AX_CHECK_COMPILER_FLAGS(-march=core-avx2, [AVX2_CFLAGS="-march=core-avx2"],
+                [AX_CHECK_COMPILER_FLAGS(-mavx2, [AVX2_CFLAGS="-mavx2"],
+                    [AC_MSG_ERROR([Need a version of gcc with either -march=core-avx2 or -mavx2])])])
             AX_CHECK_COMPILER_FLAGS(-mfma, [AVX2_CFLAGS="$AVX2_CFLAGS -mfma"],
                 [AC_MSG_WARN([Need a version of gcc with -mfma (harmless for icc)])])
         fi
@@ -411,6 +419,11 @@ case "${ax_cv_c_compiler_vendor}" in
                 [AC_MSG_ERROR([Need a version of gcc with -mvsx])])
         fi
 
+        if test "$have_msa" = "yes" -a "x$MSA_CFLAGS" = x; then
+            AX_CHECK_COMPILER_FLAGS(-mmsa, [MSA_CFLAGS="-mmsa"],
+                [AC_MSG_ERROR([Need a version of gcc with -mmsa])])
+        fi
+
     dnl FIXME:
     dnl elif test "$have_mips_ps" = "yes"; then
     dnl     # Just punt here and use only new 4.2 compiler :(
@@ -471,6 +484,7 @@ AC_SUBST(KCVI_CFLAGS)
 AC_SUBST(ALTIVEC_CFLAGS)
 AC_SUBST(VSX_CFLAGS)
 AC_SUBST(NEON_CFLAGS)
+AC_SUBST(MSA_CFLAGS)
 
 dnl add stack alignment CFLAGS if so requested
 if test "$with_incoming_stack_boundary"x != "no"x; then
@@ -766,6 +780,7 @@ AC_CONFIG_FILES([
    dft/simd/altivec/Makefile
    dft/simd/vsx/Makefile
    dft/simd/neon/Makefile
+   dft/simd/msa/Makefile
    dft/simd/generic-simd128/Makefile
    dft/simd/generic-simd256/Makefile
 
@@ -786,6 +801,7 @@ AC_CONFIG_FILES([
    rdft/simd/altivec/Makefile
    rdft/simd/vsx/Makefile
    rdft/simd/neon/Makefile
+   rdft/simd/msa/Makefile
    rdft/simd/generic-simd128/Makefile
    rdft/simd/generic-simd256/Makefile
 
diff --git a/dft/codelet-dft.h b/dft/codelet-dft.h
index b78e135c8..e419a6c38 100644
--- a/dft/codelet-dft.h
+++ b/dft/codelet-dft.h
@@ -106,6 +106,7 @@ extern const solvtab X(solvtab_dft_kcvi);
 extern const solvtab X(solvtab_dft_altivec);
 extern const solvtab X(solvtab_dft_vsx);
 extern const solvtab X(solvtab_dft_neon);
+extern const solvtab X(solvtab_dft_msa);
 extern const solvtab X(solvtab_dft_generic_simd128);
 extern const solvtab X(solvtab_dft_generic_simd256);
 
diff --git a/dft/conf.c b/dft/conf.c
index d0951de5d..b264c1db9 100644
--- a/dft/conf.c
+++ b/dft/conf.c
@@ -79,6 +79,10 @@ void X(dft_conf_standard)(planner *p)
      if (X(have_simd_neon)())
 	  X(solvtab_exec)(X(solvtab_dft_neon), p);
 #endif
+#if HAVE_MSA
+     if (X(have_simd_msa)())
+	  X(solvtab_exec)(X(solvtab_dft_msa), p);
+#endif
 #if HAVE_GENERIC_SIMD128
      X(solvtab_exec)(X(solvtab_dft_generic_simd128), p);
 #endif
diff --git a/dft/simd/Makefile.am b/dft/simd/Makefile.am
index 315d74474..fcb8e40e8 100644
--- a/dft/simd/Makefile.am
+++ b/dft/simd/Makefile.am
@@ -1,4 +1,4 @@
 AM_CPPFLAGS = -I $(top_srcdir)
-SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
+SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon msa generic-simd128 generic-simd256
 EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h	\
 t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
diff --git a/dft/simd/msa/Makefile.am b/dft/simd/msa/Makefile.am
new file mode 100644
index 000000000..3bf566935
--- /dev/null
+++ b/dft/simd/msa/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(MSA_CFLAGS)
+SIMD_HEADER=simd-support/simd-msa.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_MSA
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_msa_codelets.la
+libdft_msa_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/doc/install.texi b/doc/install.texi
index 6ccac20b2..00911a303 100644
--- a/doc/install.texi
+++ b/doc/install.texi
@@ -199,6 +199,7 @@ of the time).  @xref{Cycle Counters}.
 @code{--enable-altivec} (single),
 @code{--enable-vsx} (single, double),
 @code{--enable-neon} (single, double on aarch64),
+@code{--enable-msa} (single, double on mips),
 @code{--enable-generic-simd128},
 and
 @code{--enable-generic-simd256}:
diff --git a/doc/intro.texi b/doc/intro.texi
index b20d867f7..28ee6b0a1 100644
--- a/doc/intro.texi
+++ b/doc/intro.texi
@@ -18,8 +18,7 @@ transform (DFT) and various special cases thereof.
 
 @item  FFTW supports arbitrary multi-dimensional data.
 
-@item  FFTW supports the SSE, SSE2, AVX, AVX2, AVX512, KCVI, Altivec, VSX, and
-       NEON vector instruction sets.
+@item  FFTW supports the SSE, SSE2, AVX, AVX2, AVX512, KCVI, Altivec, VSX, NEON, and MSA vector instruction sets.
 
 @item  FFTW includes parallel (multi-threaded) transforms
        for shared-memory systems.
diff --git a/doc/other.texi b/doc/other.texi
index b2d75ce9c..f21efacce 100644
--- a/doc/other.texi
+++ b/doc/other.texi
@@ -16,8 +16,9 @@ special operations supported by some processors to perform a single
 operation on several numbers (usually 2 or 4) simultaneously.  SIMD
 floating-point instructions are available on several popular CPUs:
 SSE/SSE2/AVX/AVX2/AVX512/KCVI on some x86/x86-64 processors, AltiVec and
-VSX on some POWER/PowerPCs, NEON on some ARM models.  FFTW can be
-compiled to support the SIMD instructions on any of these systems.
+VSX on some POWER/PowerPCs, NEON on some ARM models, and MSA on some MIPS
+models.  FFTW can be compiled to support the SIMD instructions on any of
+these systems.
 @cindex SIMD
 @cindex SSE
 @cindex SSE2
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 0733e7566..4c052be13 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -103,7 +103,7 @@ extern void X(extract_reim)(int sign, R *c, R **r, R **i);
       defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
       defined(HAVE_KCVI) || \
       defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
-      defined(HAVE_MIPS_PS) || \
+      defined(HAVE_MIPS_PS) || defined(HAVE_MSA) || \
       defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
 #define HAVE_SIMD 1
 #else
@@ -119,6 +119,7 @@ extern int X(have_simd_avx512)(void);
 extern int X(have_simd_altivec)(void);
 extern int X(have_simd_vsx)(void);
 extern int X(have_simd_neon)(void);
+extern int X(have_simd_msa)(void);
 
 /* forward declarations */
 typedef struct problem_s problem;
diff --git a/rdft/codelet-rdft.h b/rdft/codelet-rdft.h
index 789040f65..0b5b8d61f 100644
--- a/rdft/codelet-rdft.h
+++ b/rdft/codelet-rdft.h
@@ -145,6 +145,7 @@ extern const solvtab X(solvtab_rdft_kcvi);
 extern const solvtab X(solvtab_rdft_altivec);
 extern const solvtab X(solvtab_rdft_vsx);
 extern const solvtab X(solvtab_rdft_neon);
+extern const solvtab X(solvtab_rdft_msa);
 extern const solvtab X(solvtab_rdft_generic_simd128);
 extern const solvtab X(solvtab_rdft_generic_simd256);
 
diff --git a/rdft/conf.c b/rdft/conf.c
index 5fe8d665f..bb656b659 100644
--- a/rdft/conf.c
+++ b/rdft/conf.c
@@ -96,6 +96,10 @@ void X(rdft_conf_standard)(planner *p)
      if (X(have_simd_neon)())
 	  X(solvtab_exec)(X(solvtab_rdft_neon), p);
 #endif
+#if HAVE_MSA
+     if (X(have_simd_msa)())
+	  X(solvtab_exec)(X(solvtab_rdft_msa), p);
+#endif
 #if HAVE_GENERIC_SIMD128
      X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p);
 #endif
diff --git a/rdft/simd/Makefile.am b/rdft/simd/Makefile.am
index 53de164f0..44c6e4883 100644
--- a/rdft/simd/Makefile.am
+++ b/rdft/simd/Makefile.am
@@ -1,4 +1,4 @@
 
 AM_CPPFLAGS = -I $(top_srcdir)
-SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
+SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon msa generic-simd128 generic-simd256
 EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk
diff --git a/rdft/simd/msa/Makefile.am b/rdft/simd/msa/Makefile.am
new file mode 100644
index 000000000..d23d60212
--- /dev/null
+++ b/rdft/simd/msa/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(MSA_CFLAGS)
+SIMD_HEADER=simd-support/simd-msa.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_MSA
+
+noinst_LTLIBRARIES = librdft_msa_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_msa_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am
index 26db46e93..fd20bb46f 100644
--- a/simd-support/Makefile.am
+++ b/simd-support/Makefile.am
@@ -11,5 +11,6 @@ avx512.c simd-avx512.h \
 kcvi.c simd-kcvi.h \
 altivec.c simd-altivec.h vsx.c simd-vsx.h \
 neon.c simd-neon.h \
+msa.c simd-msa.h \
 simd-generic128.h simd-generic256.h
 
diff --git a/simd-support/msa.c b/simd-support/msa.c
new file mode 100644
index 000000000..630d17456
--- /dev/null
+++ b/simd-support/msa.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "kernel/ifftw.h"
+
+#if HAVE_MSA
+
+/* check for an environment where signals are known to work */
+#if defined(unix) || defined(linux)
+  # include <signal.h>
+  # include <setjmp.h>
+
+  static jmp_buf jb;
+
+  static void sighandler(int x)
+  {
+    UNUSED(x);
+    longjmp(jb, 1);
+  }
+
+  static int msa_works(void) /* returns 1 if the probe instruction executes, 0 if it raises SIGILL */
+  {
+    void (*oldsig)(int);
+    oldsig = signal(SIGILL, sighandler); /* install the trap, remembering the previous handler */
+    if (setjmp(jb)) {
+      signal(SIGILL, oldsig); /* reached via longjmp() from sighandler: the probe faulted */
+      return 0;
+    } else {
+      /* probe: raw encoding of asm volatile ("xor.v $w0, $w0, $w0"); presumably spelled as .long so this file assembles without -mmsa */
+      asm volatile (".long 0x7860001e");
+      signal(SIGILL, oldsig); /* probe executed without a fault: restore the previous handler */
+      return 1;
+    }
+  }
+
+  int X(have_simd_msa)(void) /* nonzero if msa_works() succeeded; probes once, then reuses the result */
+  {
+    static int init = 0, res; /* static: both persist across calls */
+
+    if (!init) {
+      res = msa_works(); /* first call only */
+      init = 1;
+    }
+    return res;
+  }
+
+#else
+/* don't know how to autodetect MSA; assume it is present */
+  int X(have_simd_msa)(void)
+  {
+    return 1;
+  }
+#endif
+
+#endif
diff --git a/simd-support/simd-common.h b/simd-support/simd-common.h
index ad2c96fa1..46caa118d 100644
--- a/simd-support/simd-common.h
+++ b/simd-support/simd-common.h
@@ -44,6 +44,9 @@
 #    define ALIGNMENT 16     /* Alignment for the LD/ST macros */
 #  endif
 #  define ALIGNMENTA 64   /* Alignment for the LDA/STA macros */
+#elif defined(HAVE_MSA)
+#  define ALIGNMENT  16    /* Alignment for the LD/ST macros */
+#  define ALIGNMENTA 16    /* Alignment for the LDA/STA macros */
 #elif defined(HAVE_GENERIC_SIMD256)
 #  if defined(FFTW_SINGLE)
 #    define ALIGNMENT 8
diff --git a/simd-support/simd-msa.h b/simd-support/simd-msa.h
new file mode 100644
index 000000000..3d3a24b79
--- /dev/null
+++ b/simd-support/simd-msa.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
+#  error "MSA only works in single/double precision"
+#endif
+
+#define SIMD_SUFFIX  _msa
+
+#ifdef FFTW_SINGLE
+#  define DS(d,s) s /* single-precision option */
+#  define SUFF(name) name ## _w
+#else
+#  define DS(d,s) d /* double-precision option */
+#  define SUFF(name) name ## _d
+#endif
+
+#define VL DS(1,2)         /* SIMD vector length, in terms of complex numbers */
+#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
+
+#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__mips_msa)
+#  error "compiling simd-msa.h in double precision without -mmsa"
+#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__mips_msa)
+#  error "compiling simd-msa.h in single precision without -mmsa"
+#endif
+
+#include <msa.h>
+
+typedef DS(v2f64, v4f32) V;
+#define VADD SUFF(__builtin_msa_fadd)
+#define VSUB SUFF(__builtin_msa_fsub)
+#define VMUL SUFF(__builtin_msa_fmul)
+
+#define LDK(x) x
+
+static inline V VDUPL(V x)
+{
+#ifdef FFTW_SINGLE
+    /* __builtin_shuffle(x, (v4i32){0, 0, 2, 2}); */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0xa0);
+#else
+    /* __builtin_shuffle(x, (v4i32){0, 1, 0, 1}) */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0x44);
+#endif
+}
+
+static inline V VDUPH(V x)
+{
+#ifdef FFTW_SINGLE
+    /* __builtin_shuffle(x, (v4i32){1, 1, 3, 3}); */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0xf5);
+#else
+    /* __builtin_shuffle(x, (v4i32){2, 3, 2, 3}) */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0xee);
+#endif
+}
+
+/* Broadcast val to every lane.  A union reads the bit pattern; the old *(int*)&val cast violated strict aliasing. */
+static inline V MSA_FILL(DS(double, float) val)
+{
+#ifdef FFTW_SINGLE
+    union { float f; int i; } bits = { val };
+    return (V)__builtin_msa_fill_w(bits.i);
+#else
+    union { double f; long long int i; } bits = { val };
+    return (V)__builtin_msa_fill_d(bits.i);
+#endif
+}
+
+#define DVK(var, val) V var = MSA_FILL(val)
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ivs; /* UNUSED */
+     return *(const V *)x;
+}
+
+static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ovs; /* UNUSED */
+     *(V *)x = v;
+}
+
+#ifdef FFTW_SINGLE
+
+static inline V LD(const R *x, INT ivs, const R *aligned_like) /* gather two complex singles, one at x and one at x + ivs */
+{
+    (void)aligned_like; /* UNUSED */
+    V resl = (V)__builtin_msa_ld_w(x, 0); /* full-vector load; only its low 64 bits are kept below */
+    V resh = (V)__builtin_msa_ld_w(x + ivs, 0); /* likewise for the second complex number */
+    return (V)__builtin_msa_ilvr_d((v2i64)resh, (v2i64)resl); /* lanes 0-1 from resl, lanes 2-3 from resh (mirrors ST) */
+}
+
+static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
+{
+    (void)aligned_like; /* UNUSED */
+    *(x + ovs    ) = v[2];
+    *(x + ovs + 1) = v[3];
+    *(x    ) = v[0];
+    *(x + 1) = v[1];
+}
+
+#else /* ! FFTW_SINGLE */
+#  define LD LDA
+#  define ST STA
+#endif
+
+#ifdef FFTW_SINGLE
+#define STM2 ST
+#define STN2(x, v0, v1, ovs) /* nop */
+/* UNPCKL(a,b) = {a0,b0,a1,b1}, UNPCKH(a,b) = {a2,b2,a3,b3}.  ilvr/ilvl put their SECOND operand in the even lanes, hence (b, a). */
+#define UNPCKL(a, b) (V)__builtin_msa_ilvr_w((v4i32)(b), (v4i32)(a))
+#define UNPCKH(a, b) (V)__builtin_msa_ilvl_w((v4i32)(b), (v4i32)(a))
+
+#  define STN4(x, v0, v1, v2, v3, ovs)          \
+{                           \
+     V xxx0, xxx1, xxx2, xxx3;              \
+     xxx0 = UNPCKL(v0, v2);             \
+     xxx1 = UNPCKH(v0, v2);             \
+     xxx2 = UNPCKL(v1, v3);             \
+     xxx3 = UNPCKH(v1, v3);             \
+     STA(x, UNPCKL(xxx0, xxx2), 0, 0);          \
+     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);        \
+     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0);    \
+     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0);    \
+}
+
+#define STM4(x, v, ovs, aligned_like) /* no-op */
+
+#else
+/* FFTW_DOUBLE */
+
+#define STM2 STA
+#define STN2(x, v0, v1, ovs) /* nop */
+
+static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
+{
+    (void)aligned_like; /* UNUSED */
+    *(x) = v[0];
+    *(x+ovs) = v[1];
+}
+#  define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
+#endif
+
+static inline V FLIP_RI(V x)
+{
+#ifdef FFTW_SINGLE
+    /* __builtin_shuffle(x, (v4i32){1, 0, 3, 2}); */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0xb1);
+#else
+    /* __builtin_shuffle(x, (v4i32){2, 3, 0, 1}) */
+    return (V)__builtin_msa_shf_w((v4i32)x, 0x4e);
+#endif
+}
+
+/* Complex conjugate: flip the sign bit of each imaginary (odd) lane.  Lane-typed masks keep this independent of byte order. */
+static inline V VCONJ(V x)
+{
+#ifdef FFTW_SINGLE
+    static const v4u32 pm = {0, 0x80000000u, 0, 0x80000000u};
+#else
+    static const v2u64 pm = {0, 0x8000000000000000ull};
+#endif
+    return (V)__builtin_msa_xor_v((v16u8)x, (v16u8)pm);
+}
+
+static inline V VBYI(V x)
+{
+    return FLIP_RI(VCONJ(x));
+}
+
+#ifdef FFTW_SINGLE
+#  define VLIT(x0, x1) {x0, x1, x0, x1}
+#else
+#  define VLIT(x0, x1) {x0, x1}
+#endif
+
+/* FMA support.  VFMA = a*b+c, VFNMS = c-a*b, VFMS = a*b-c.  The MSA builtins compute x +/- y*z: the addend is their FIRST argument. */
+#ifdef MIPS_MSA_FMA
+#  define VFMA(a, b, c) SUFF(__builtin_msa_fmadd)(c, a, b)
+#  define VFNMS(a, b, c) SUFF(__builtin_msa_fmsub)(c, a, b)
+#  define VFMS(a, b, c) (-SUFF(__builtin_msa_fmsub)(c, a, b))
+#else
+#  define VFMA(a, b, c) VADD(c, VMUL(a, b))
+#  define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
+#  define VFMS(a, b, c) VSUB(VMUL(a, b), c)
+#endif
+
+static inline V VFMAI(V b, V c)
+{
+    static const V mp = VLIT(-1.0, 1.0);
+    return VFMA(FLIP_RI(b), mp, c);
+}
+
+static inline V VFNMSI(V b, V c)
+{
+    static const V mp = VLIT(-1.0, 1.0);
+    return VFNMS(FLIP_RI(b), mp, c);
+}
+
+static inline V VFMACONJ(V b, V c)
+{
+    static const V pm = VLIT(1.0, -1.0);
+    return VFMA(b, pm, c);
+}
+
+static inline V VFNMSCONJ(V b, V c)
+{
+    static const V pm = VLIT(1.0, -1.0);
+    return VFNMS(b, pm, c);
+}
+
+static inline V VFMSCONJ(V b, V c)
+{
+    return VSUB(VCONJ(b), c);
+}
+
+static inline V VZMUL(V tx, V sr)
+{
+    V tr = VDUPL(tx);
+    V ti = VDUPH(tx);
+    tr = VMUL(sr, tr);
+    sr = VBYI(sr);
+    return VFMA(ti, sr, tr);
+}
+
+static inline V VZMULJ(V tx, V sr)
+{
+    V tr = VDUPL(tx);
+    V ti = VDUPH(tx);
+    tr = VMUL(sr, tr);
+    sr = VBYI(sr);
+    return VFNMS(ti, sr, tr);
+}
+
+static inline V VZMULI(V tx, V sr)
+{
+    V tr = VDUPL(tx);
+    V ti = VDUPH(tx);
+    ti = VMUL(ti, sr);
+    sr = VBYI(sr);
+    return VFMS(tr, sr, ti);
+}
+
+static inline V VZMULIJ(V tx, V sr)
+{
+    V tr = VDUPL(tx);
+    V ti = VDUPH(tx);
+    ti = VMUL(ti, sr);
+    sr = VBYI(sr);
+    return VFMA(tr, sr, ti);
+}
+
+/* twiddle storage #1: compact, slower */
+#ifdef FFTW_SINGLE
+#  define VTW1(v,x)  \
+  {TW_CEXP, v, x}, {TW_CEXP, v+1, x}     
+static inline V BYTW1(const R *t, V sr)
+{
+    return VZMUL(LDA(t, 2, t), sr);
+}
+static inline V BYTWJ1(const R *t, V sr)
+{
+    return VZMULJ(LDA(t, 2, t), sr);
+}
+#else /* !FFTW_SINGLE */
+#  define VTW1(v,x) {TW_CEXP, v, x}
+static inline V BYTW1(const R *t, V sr)
+{
+    V tx = LD(t, 1, t);
+    return VZMUL(tx, sr);
+}
+static inline V BYTWJ1(const R *t, V sr)
+{
+    V tx = LD(t, 1, t);
+    return VZMULJ(tx, sr);
+}
+#endif
+#define TWVL1 (VL)
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#ifdef FFTW_SINGLE
+#  define VTW2(v,x)                                                     \
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},   \
+  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
+#else /* !FFTW_SINGLE */
+#  define VTW2(v,x)                                                     \
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
+#endif
+#define TWVL2 (2 * VL)
+static inline V BYTW2(const R *t, V sr)
+{
+    const V *twp = (const V *)t;
+    V si = FLIP_RI(sr);
+    V tr = twp[0], ti = twp[1];
+    return VFMA(tr, sr, VMUL(ti, si));
+}
+static inline V BYTWJ2(const R *t, V sr)
+{
+    const V *twp = (const V *)t;
+    V si = FLIP_RI(sr);
+    V tr = twp[0], ti = twp[1];
+    return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#ifdef FFTW_SINGLE
+#  define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#  define TWVL3 (VL)
+#else
+#  define VTW3(v,x) VTW1(v,x)
+#  define TWVL3 TWVL1
+#endif
+
+/* twiddle storage for split arrays */
+#ifdef FFTW_SINGLE
+#  define VTWS(v,x)                                                       \
+    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
+#else
+#  define VTWS(v,x)                                                       \
+    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
+#endif
+#define TWVLS (2 * VL)
+
+#define VLEAVE() /* nothing */
+
+#include "simd-common.h"