diff --git a/include/finufft/defs.h b/include/finufft/defs.h
index c8ef5a11..42e5e7ff 100644
--- a/include/finufft/defs.h
+++ b/include/finufft/defs.h
@@ -18,6 +18,7 @@
 // public header gives access to f_opts, f_spread_opts, f_plan...
 // (and clobbers FINUFFT* macros; watch out!)
 #include <finufft.h>
+#include <memory>
 
 // --------------- Private data types for compilation in either prec ---------
 // Devnote: must match those in relevant prec of public finufft.h interface!
@@ -89,7 +90,7 @@ inline constexpr BIGINT MAX_NF = BIGINT(1e12);
 inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14);
 
 // -------------- Math consts (not in math.h) and useful math macros ----------
-#include <math.h>
+#include <cmath>
 
 // either-precision unit imaginary number...
 #define IMA (CPX(0.0, 1.0))
@@ -118,11 +119,11 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14);
 // These macros should probably be replaced by modern C++ std lib or random123.
 // (RAND_MAX is in stdlib.h)
 #include <stdlib.h>
-static inline FLT rand01() { return FLT(rand()) / FLT(RAND_MAX); }
+static inline FLT rand01 [[maybe_unused]] () { return FLT(rand()) / FLT(RAND_MAX); }
 // unif[-1,1]:
-static inline FLT randm11() { return 2 * rand01() - FLT(1); }
+static inline FLT randm11 [[maybe_unused]] () { return 2 * rand01() - FLT(1); }
 // complex unif[-1,1] for Re and Im:
-static inline CPX crandm11() { return randm11() + IMA * randm11(); }
+static inline CPX crandm11 [[maybe_unused]] () { return randm11() + IMA * randm11(); }
 
 // Thread-safe seed-carrying versions of above (x is ptr to seed)...
 // MR: we have to leave those as macros for now, as "rand_r" is deprecated
@@ -134,11 +135,17 @@ static inline CPX crandm11() { return randm11() + IMA * randm11(); }
 // complex unif[-1,1] for Re and Im:
 #define crandm11r(x) (randm11r(x) + IMA * randm11r(x))
 #else
-static inline FLT rand01r(unsigned int *x) { return FLT(rand_r(x)) / FLT(RAND_MAX); }
+static inline FLT rand01r [[maybe_unused]] (unsigned int *x) {
+  return FLT(rand_r(x)) / FLT(RAND_MAX);
+}
 // unif[-1,1]:
-static inline FLT randm11r(unsigned int *x) { return 2 * rand01r(x) - FLT(1); }
+static inline FLT randm11r [[maybe_unused]] (unsigned int *x) {
+  return 2 * rand01r(x) - FLT(1);
+}
 // complex unif[-1,1] for Re and Im:
-static inline CPX crandm11r(unsigned int *x) { return randm11r(x) + IMA * randm11r(x); }
+static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) {
+  return randm11r(x) + IMA * randm11r(x);
+}
 #endif
 
 // ----- OpenMP macros which also work when omp not present -----
@@ -147,16 +154,24 @@ static inline CPX crandm11r(unsigned int *x) { return randm11r(x) + IMA * randm1
 #ifdef _OPENMP
 #include <omp.h>
 // point to actual omp utils
-static inline int MY_OMP_GET_NUM_THREADS() { return omp_get_num_threads(); }
-static inline int MY_OMP_GET_MAX_THREADS() { return omp_get_max_threads(); }
-static inline int MY_OMP_GET_THREAD_NUM() { return omp_get_thread_num(); }
-static inline void MY_OMP_SET_NUM_THREADS(int x) { omp_set_num_threads(x); }
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () {
+  return omp_get_num_threads();
+}
+static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () {
+  return omp_get_max_threads();
+}
+static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () {
+  return omp_get_thread_num();
+}
+static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) {
+  omp_set_num_threads(x);
+}
 #else
 // non-omp safe dummy versions of omp utils...
-static inline int MY_OMP_GET_NUM_THREADS() { return 1; }
-static inline int MY_OMP_GET_MAX_THREADS() { return 1; }
-static inline int MY_OMP_GET_THREAD_NUM() { return 0; }
-static inline void MY_OMP_SET_NUM_THREADS(int) {}
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; }
+static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {}
 #endif
 
 // Prec-switching name macros (respond to SINGLE), used in lib & test sources
@@ -212,10 +227,17 @@ template<typename T> struct type3params {
   T X3, C3, D3, h3, gam3; // z
 };
 
-typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
-
-  int type;                     // transform type (Rokhlin naming): 1,2 or 3
-  int dim;                      // overall dimension: 1,2 or 3
+struct FINUFFT_PLAN_S { // the main plan object, fully C++
+  // These default and delete specifications just state the obvious,
+  // but are here to silence compiler warnings.
+  FINUFFT_PLAN_S() = default;
+  // Copy construction and assignent are already deleted implicitly
+  // because of the unique_ptr member.
+  FINUFFT_PLAN_S(const FINUFFT_PLAN_S &)            = delete;
+  FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete;
+
+  int type;            // transform type (Rokhlin naming): 1,2 or 3
+  int dim;             // overall dimension: 1,2 or 3
   int ntrans;          // how many transforms to do at once (vector or "many" mode)
   BIGINT nj;           // num of NU pts in type 1,2 (for type 3, num input x pts)
   BIGINT nk;           // number of NU freq pts (type 3 only)
@@ -258,12 +280,9 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
   FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
 
   // other internal structs; each is C-compatible of course
-#ifndef FINUFFT_USE_DUCC0
-  FFTW_PLAN fftwPlan;
-#endif
+  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
   finufft_opts opts; // this and spopts could be made ptrs
   finufft_spread_opts spopts;
-
-} FINUFFT_PLAN_S;
+};
 
 #endif // DEFS_H
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index 906f6244..bab43966 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -1,18 +1,189 @@
 #ifndef FINUFFT_INCLUDE_FINUFFT_FFT_H
 #define FINUFFT_INCLUDE_FINUFFT_FFT_H
 
+#include <vector>
+
 #ifdef FINUFFT_USE_DUCC0
-#include "ducc0/fft/fftnd_impl.h"
-// temporary hacks to allow compilation of tests that assume FFTW is used
-static inline void FFTW_FORGET_WISDOM() {}
-static inline void FFTW_CLEANUP() {}
-static inline void FFTW_CLEANUP_THREADS() {}
+#include <complex>
+
+template<typename T> class Finufft_FFT_plan {
+public:
+  [[maybe_unused]] Finufft_FFT_plan(void (*)(void *) = nullptr,
+                                    void (*)(void *) = nullptr, void * = nullptr) {}
+  [[maybe_unused]] void plan(const std::vector<int> & /*dims*/, size_t /*batchSize*/,
+                             std::complex<T> * /*ptr*/, int /*sign*/, int /*options*/,
+                             int /*nthreads*/) {}
+  [[maybe_unused]] static std::complex<T> *alloc_complex(size_t N) {
+    return new std::complex<T>[N];
+  }
+  [[maybe_unused]] static void free(std::complex<T> *ptr) { delete[] ptr; }
+
+  [[maybe_unused]] static void forget_wisdom() {}
+  [[maybe_unused]] static void cleanup() {}
+  [[maybe_unused]] static void cleanup_threads() {}
+};
+
 #else
-#include "fftw_defs.h"
+
+//clang-format off
+#include <complex>
+#include <fftw3.h> // (after complex) needed so can typedef FFTW_CPX
+//clang-format on
+#include <mutex>
+
+template<typename T> class Finufft_FFT_plan {};
+
+template<> struct Finufft_FFT_plan<float> {
+private:
+  static std::mutex &mut() {
+    static std::mutex mut_;
+    return mut_;
+  }
+  fftwf_plan plan_;
+
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
+public:
+  [[maybe_unused]] Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                                    void (*fftw_unlock_fun_)(void *) = nullptr,
+                                    void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
+#ifdef _OPENMP
+    static bool initialized = false;
+    if (!initialized) {
+      fftwf_init_threads();
+      initialized = true;
+    }
+#endif
+    unlock();
+  }
+  [[maybe_unused]] ~Finufft_FFT_plan() {
+    lock();
+    fftwf_destroy_plan(plan_);
+    unlock();
+  }
+
+  void plan
+      [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
+                        std::complex<float> *ptr, int sign, int options, int nthreads) {
+    uint64_t nf = 1;
+    for (auto i : dims) nf *= i;
+    lock();
+#ifdef _OPENMP
+    fftwf_plan_with_nthreads(nthreads);
+#endif
+    plan_ = fftwf_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
+                                reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1,
+                                int(nf), reinterpret_cast<fftwf_complex *>(ptr), nullptr,
+                                1, int(nf), sign, unsigned(options));
+    unlock();
+  }
+  static std::complex<float> *alloc_complex [[maybe_unused]] (size_t N) {
+    return reinterpret_cast<std::complex<float> *>(fftwf_alloc_complex(N));
+  }
+  static void free [[maybe_unused]] (std::complex<float> *ptr) {
+    if (ptr) fftwf_free(reinterpret_cast<fftwf_complex *>(ptr));
+  }
+  void execute [[maybe_unused]] () { fftwf_execute(plan_); }
+
+  static void forget_wisdom [[maybe_unused]] () { fftwf_forget_wisdom(); }
+  static void cleanup [[maybe_unused]] () { fftwf_cleanup(); }
+  static void cleanup_threads [[maybe_unused]] () {
+#ifdef _OPENMP
+    fftwf_cleanup_threads();
 #endif
+  }
+};
+
+template<> struct Finufft_FFT_plan<double> {
+private:
+  static std::mutex &mut() {
+    static std::mutex mut_;
+    return mut_;
+  }
+  fftw_plan plan_;
+
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
+public:
+  [[maybe_unused]] Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                                    void (*fftw_unlock_fun_)(void *) = nullptr,
+                                    void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
+#ifdef _OPENMP
+    static bool initialized = false;
+    if (!initialized) {
+      fftw_init_threads();
+      initialized = true;
+    }
+#endif
+    unlock();
+  }
+  [[maybe_unused]] ~Finufft_FFT_plan() {
+    lock();
+    fftw_destroy_plan(plan_);
+    unlock();
+  }
+
+  void plan
+      [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
+                        std::complex<double> *ptr, int sign, int options, int nthreads) {
+    uint64_t nf = 1;
+    for (auto i : dims) nf *= i;
+    lock();
+#ifdef _OPENMP
+    fftw_plan_with_nthreads(nthreads);
+#endif
+    plan_ = fftw_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
+                               reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, int(nf),
+                               reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, int(nf),
+                               sign, unsigned(options));
+    unlock();
+  }
+  static std::complex<double> *alloc_complex [[maybe_unused]] (size_t N) {
+    return reinterpret_cast<std::complex<double> *>(fftw_alloc_complex(N));
+  }
+  static void free [[maybe_unused]] (std::complex<double> *ptr) {
+    fftw_free(reinterpret_cast<fftw_complex *>(ptr));
+  }
+  void execute [[maybe_unused]] () { fftw_execute(plan_); }
+
+  static void forget_wisdom [[maybe_unused]] () { fftw_forget_wisdom(); }
+  static void cleanup [[maybe_unused]] () { fftw_cleanup(); }
+  static void cleanup_threads [[maybe_unused]] () {
+#ifdef _OPENMP
+    fftw_cleanup_threads();
+#endif
+  }
+};
+
+#endif
+
 #include <finufft/defs.h>
 
-int *gridsize_for_fft(FINUFFT_PLAN p);
+static inline void finufft_fft_forget_wisdom [[maybe_unused]] () {
+  Finufft_FFT_plan<FLT>::forget_wisdom();
+}
+static inline void finufft_fft_cleanup [[maybe_unused]] () {
+  Finufft_FFT_plan<FLT>::cleanup();
+}
+static inline void finufft_fft_cleanup_threads [[maybe_unused]] () {
+  Finufft_FFT_plan<FLT>::cleanup_threads();
+}
+
+std::vector<int> gridsize_for_fft(FINUFFT_PLAN p);
 void do_fft(FINUFFT_PLAN p);
 
 #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H
diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h
deleted file mode 100644
index 39f8d8c7..00000000
--- a/include/finufft/fftw_defs.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// all FFTW-related private FINUFFT headers
-
-#ifndef FFTW_DEFS_H
-#define FFTW_DEFS_H
-
-// Here we define typedefs and MACROS to switch between single and double
-// precision library compilation, which need different FFTW command symbols.
-// Barnett simplified via FFTWIFY, 6/7/22.
-
-#include <fftw3.h> // (after complex.h) needed so can typedef FFTW_CPX
-
-// precision-switching names for interfaces to FFTW...
-#ifdef SINGLE
-// macro to prepend fftw_ (for double) or fftwf_ (for single) to a string
-// without a space. The 2nd level of indirection is needed for safety, see:
-// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
-#define FFTWIFY_UNSAFE(x) fftwf_##x
-#else
-#define FFTWIFY_UNSAFE(x) fftw_##x
-#endif
-#define FFTWIFY(x)         FFTWIFY_UNSAFE(x)
-// now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros):
-#define FFTW_CPX           FFTWIFY(complex)
-#define FFTW_PLAN          FFTWIFY(plan)
-#define FFTW_ALLOC_CPX     FFTWIFY(alloc_complex)
-#define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft)
-#define FFTW_EX            FFTWIFY(execute)
-#define FFTW_DE            FFTWIFY(destroy_plan)
-#define FFTW_FR            FFTWIFY(free)
-#define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom)
-#define FFTW_CLEANUP       FFTWIFY(cleanup)
-// the following OMP switch could be done in the src code instead...
-#ifdef _OPENMP
-#define FFTW_INIT            FFTWIFY(init_threads)
-#define FFTW_PLAN_TH         FFTWIFY(plan_with_nthreads)
-#define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads)
-#else
-// no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls...
-#define FFTW_INIT()
-#define FFTW_PLAN_TH(x)
-#define FFTW_CLEANUP_THREADS()
-#endif
-
-#endif // FFTW_DEFS_H
diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp
index 90055a36..b5030a85 100644
--- a/perftest/guru_timing_test.cpp
+++ b/perftest/guru_timing_test.cpp
@@ -145,9 +145,9 @@ int main(int argc, char *argv[])
   }
 
   // Andrea found the following are needed to get reliable independent timings:
-  FFTW_CLEANUP();
-  FFTW_CLEANUP_THREADS();
-  FFTW_FORGET_WISDOM();
+  finufft_fft_cleanup();
+  finufft_fft_cleanup_threads();
+  finufft_fft_forget_wisdom();
   // std::this_thread::sleep_for(std::chrono::seconds(1));
   sleep(tsleep);
 
diff --git a/src/fft.cpp b/src/fft.cpp
index 1b801a8b..bb7e3244 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -3,31 +3,23 @@
 
 using namespace std;
 
-int *gridsize_for_fft(FINUFFT_PLAN p) {
+#ifdef FINUFFT_USE_DUCC0
+#include "ducc0/fft/fftnd_impl.h"
+#endif
+
+std::vector<int> gridsize_for_fft(FINUFFT_PLAN p) {
   // local helper func returns a new int array of length dim, extracted from
   // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument.
-  int *nf;
-  if (p->dim == 1) {
-    nf    = new int[1];
-    nf[0] = (int)p->nf1;
-  } else if (p->dim == 2) {
-    nf    = new int[2];
-    nf[0] = (int)p->nf2;
-    nf[1] = (int)p->nf1;
-  } // fftw enforced row major ordering, ie dims are backwards ordered
-  else {
-    nf    = new int[3];
-    nf[0] = (int)p->nf3;
-    nf[1] = (int)p->nf2;
-    nf[2] = (int)p->nf1;
-  }
-  return nf;
+  if (p->dim == 1) return {(int)p->nf1};
+  if (p->dim == 2) return {(int)p->nf2, (int)p->nf1};
+  // if (p->dim == 3)
+  return {(int)p->nf3, (int)p->nf2, (int)p->nf1};
 }
 
 void do_fft(FINUFFT_PLAN p) {
 #ifdef FINUFFT_USE_DUCC0
   size_t nthreads = min<size_t>(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads);
-  int *ns         = gridsize_for_fft(p);
+  const auto ns   = gridsize_for_fft(p);
   vector<size_t> arrdims, axes;
   arrdims.push_back(size_t(p->batchSize));
   arrdims.push_back(size_t(ns[0]));
@@ -60,8 +52,10 @@ void do_fft(FINUFFT_PLAN p) {
     else {
       size_t y_lo = size_t((p->ms + 1) / 2);
       size_t y_hi = size_t(ns[1] - p->ms / 2);
-      auto sub1   = ducc0::subarray(data, {{}, {}, {0, y_lo}});
-      auto sub2   = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}});
+      // the next line is analogous to the Python statement "sub1 = data[:, :, :y_lo]"
+      auto sub1 = ducc0::subarray(data, {{}, {}, {0, y_lo}});
+      // the next line is analogous to the Python statement "sub2 = data[:, :, y_hi:]"
+      auto sub2 = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}});
       if (p->type == 1) // spreading, not all parts of the output array are needed
         // do axis 2 in full
         ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads);
@@ -108,8 +102,7 @@ void do_fft(FINUFFT_PLAN p) {
     }
   }
 #endif
-  delete[] ns;
 #else
-  FFTW_EX(p->fftwPlan); // if thisBatchSize<batchSize it wastes some flops
+  p->fftPlan->execute(); // if thisBatchSize<batchSize it wastes some flops
 #endif
 }
diff --git a/src/finufft.cpp b/src/finufft.cpp
index 9ed3f7d2..21e6db7a 100644
--- a/src/finufft.cpp
+++ b/src/finufft.cpp
@@ -10,12 +10,12 @@
 #include <finufft/utils_precindep.h>
 
 #include "../contrib/legendre_rule_fast.h"
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
 #include <iomanip>
 #include <iostream>
-#include <math.h>
-#include <mutex>
-#include <stdio.h>
-#include <stdlib.h>
+#include <memory>
 #include <vector>
 
 using namespace std;
@@ -85,42 +85,13 @@ Design notes for guru interface implementation:
 namespace finufft {
 namespace common {
 
-#ifndef FINUFFT_USE_DUCC0
-// Technically global state...
-// Needs to be static to avoid name collision with SINGLE/DOUBLE
-static std::mutex fftw_lock;
-
-class FFTWLockGuard {
-public:
-  FFTWLockGuard(void (*lock_fun)(void *), void (*unlock_fun)(void *), void *lock_data)
-      : unlock_fun_(unlock_fun), lock_data_(lock_data), fftw_lock_(fftw_lock) {
-    if (lock_fun)
-      lock_fun(lock_data_);
-    else
-      fftw_lock_.lock();
-  }
-  ~FFTWLockGuard() {
-    if (unlock_fun_)
-      unlock_fun_(lock_data_);
-    else
-      fftw_lock_.unlock();
-  }
-
-private:
-  void (*unlock_fun_)(void *);
-  void *lock_data_;
-  std::mutex &fftw_lock_;
-};
-
-#endif
-
 static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts,
                          BIGINT *nf)
 // Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts
 // and requested number of Fourier modes ms. Returns 0 if success, else an
 // error code if nf was unreasonably big (& tell the world).
 {
-  *nf = (BIGINT)(opts.upsampfac * ms); // manner of rounding not crucial
+  *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial
   if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails
   if (*nf < MAX_NF) {
     *nf = next235even(*nf);                               // expensive at huge nf
@@ -185,7 +156,7 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts,
   else
     Ssafe = max(Ssafe, 1 / X);
   // use the safe X and S...
-  FLT nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss;
+  auto nfd = FLT(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss);
   if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf
   *nf = (BIGINT)nfd;
   // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread);
@@ -193,8 +164,8 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts,
   if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread;
   if (*nf < MAX_NF)                                 // otherwise will fail anyway
     *nf = next235even(*nf);                         // expensive at huge nf
-  *h   = 2 * PI / *nf;                              // upsampled grid spacing
-  *gam = (FLT)*nf / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x'
+  *h   = FLT(2.0 * PI / *nf);                       // upsampled grid spacing
+  *gam = FLT(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x'
 }
 
 void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts)
@@ -578,6 +549,9 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
            __func__);
 
+  p->fftPlan = std::make_unique<Finufft_FFT_plan<FLT>>(
+      p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data);
+
   if ((type != 1) && (type != 2) && (type != 3)) {
     fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
@@ -682,21 +656,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
   //  ------------------------ types 1,2: planning needed ---------------------
   if (type == 1 || type == 2) {
 
-#ifndef FINUFFT_USE_DUCC0
     int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?)
                          // Note: batchSize not used since might be only 1.
-    // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT
-    // thread-safe (can be called inside OMP)
-    {
-      static bool did_fftw_init = false; // the only global state of FINUFFT
-      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
-                         p->opts.fftw_lock_data);
-      if (!did_fftw_init) {
-        FFTW_INIT();          // setup FFTW global state; should only do once
-        did_fftw_init = true; // ensure other FINUFFT threads don't clash
-      }
-    }
-#endif
 
     p->spopts.spread_direction = type;
 
@@ -760,11 +721,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     }
 
     timer.restart();
-#ifdef FINUFFT_USE_DUCC0
-    p->fwBatch = (CPX *)malloc(p->nf * p->batchSize * sizeof(CPX)); // the big workspace
-#else
-    p->fwBatch = (CPX *)FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace
-#endif
+    p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace
     if (p->opts.debug)
       printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
              (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec());
@@ -777,30 +734,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
       return FINUFFT_ERR_ALLOC;
     }
 
-#ifndef FINUFFT_USE_DUCC0
     timer.restart(); // plan the FFTW
-    int *ns = gridsize_for_fft(p);
-    // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride,
-    // idist, ot, onembed, ostride, odist, sign, flags
-    {
-      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
-                         p->opts.fftw_lock_data);
-      // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads.
-      // FIXME: Since this might override what the user wants for fftw, we'd like to
-      // set it just for our one plan and then revert to the user value.
-      // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and
-      // there isn't a convenient mechanism to probe the version
-      // there is fftw_version which returns a string, but that's not compile time
-      FFTW_PLAN_TH(nthr_fft);
-      p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, (FFTW_CPX *)p->fwBatch,
-                                       NULL, 1, p->nf, (FFTW_CPX *)p->fwBatch, NULL, 1,
-                                       p->nf, p->fftSign, p->opts.fftw);
-    }
+    const auto ns = gridsize_for_fft(p);
+    p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft);
     if (p->opts.debug)
-      printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw,
+      printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw,
              nthr_fft, timer.elapsedsec());
-    delete[] ns;
-#endif
 
   } else { // -------------------------- type 3 (no planning) ------------
 
@@ -924,13 +863,8 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT
               __func__);
       return FINUFFT_ERR_MAXNALLOC;
     }
-#ifdef FINUFFT_USE_DUCC0
-    free(p->fwBatch);
-    p->fwBatch = (CPX *)malloc(p->nf * p->batchSize * sizeof(CPX)); // maybe big workspace
-#else
-    if (p->fwBatch) FFTW_FR(p->fwBatch);
-    p->fwBatch = (CPX *)FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace
-#endif
+    p->fftPlan->free(p->fwBatch);
+    p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace
 
     // (note FFTW_ALLOC is not needed over malloc, but matches its type)
     if (p->CpBatch) free(p->CpBatch);
@@ -1245,20 +1179,9 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
   if (!p) // NULL ptr, so not a ptr to a plan, report error
     return 1;
 
-#ifdef FINUFFT_USE_DUCC0
-  free(p->fwBatch); // free the big FFTW (or t3 spread) working array
-#else
-  FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array
-#endif
+  p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array
   free(p->sortIndices);
   if (p->type == 1 || p->type == 2) {
-#ifndef FINUFFT_USE_DUCC0
-    {
-      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
-                         p->opts.fftw_lock_data);
-      FFTW_DE(p->fftwPlan);
-    }
-#endif
     free(p->phiHat1);
     free(p->phiHat2);
     free(p->phiHat3);
diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp
index e17cbb65..1219670a 100644
--- a/test/finufft1dmany_test.cpp
+++ b/test/finufft1dmany_test.cpp
@@ -87,7 +87,7 @@ int main(int argc, char *argv[]) {
   printf("\tone mode: rel err in F[%lld] of trans#%d is %.3g\n", (long long)nt1, i, err);
 
   // compare the result with FINUFFT1D1
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   CPX *F_1d1 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   CPX *Fstart;
   CPX *cstart;
@@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
   free(F_1d1);
 
   printf("test 1d2 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
 
 #pragma omp parallel
   {
@@ -148,7 +148,7 @@ int main(int argc, char *argv[]) {
   printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err);
 
   // check against single calls to FINUFFT1D2...
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   CPX *c_1d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf);
   timer.restart();
   for (BIGINT j = 0; j < ntransf; j++) {
@@ -173,7 +173,7 @@ int main(int argc, char *argv[]) {
   free(c_1d2);
 
   printf("test 1d3 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
 
 #pragma omp parallel
   {
@@ -214,7 +214,7 @@ int main(int argc, char *argv[]) {
   printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
 
   // compare the result with single calls to FINUFFT1D3...
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   CPX *f_1d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
   for (int k = 0; k < ntransf; k++) {
diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp
index fd26f919..155c71ae 100644
--- a/test/finufft2dmany_test.cpp
+++ b/test/finufft2dmany_test.cpp
@@ -94,7 +94,7 @@ int main(int argc, char *argv[]) {
          (long long)nt2, i, err);
 
   // compare the result with FINUFFT2D1
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   finufft_opts simpleopts = opts;
   simpleopts.debug        = 0; // don't output timing for calls of FINUFFT2D1
   simpleopts.spread_debug = 0;
@@ -134,7 +134,7 @@ int main(int argc, char *argv[]) {
     for (BIGINT m = 0; m < N * ntransf; ++m) F[m] = crandm11r(&se);
   }
 
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   timer.restart();
   ier = FINUFFT2D2MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts);
   ti  = timer.elapsedsec();
@@ -145,7 +145,7 @@ int main(int argc, char *argv[]) {
     printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
            ntransf, (long long)N1, (long long)N2, (long long)M, ti, ntransf * M / ti);
 
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   i         = ntransf - 1; // choose a data to check
   BIGINT jt = M / 2;       // check arbitrary choice of one targ pt
   CPX ct    = CPX(0, 0);
@@ -182,7 +182,7 @@ int main(int argc, char *argv[]) {
   free(c_2d2);
 
   printf("test 2d3 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
 
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
@@ -231,7 +231,7 @@ int main(int argc, char *argv[]) {
   printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
 
   // compare the result with FINUFFT2D3...
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   CPX *f_2d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
   for (int k = 0; k < ntransf; ++k) {
diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp
index 485f90b0..ad7c954c 100644
--- a/test/finufft3dmany_test.cpp
+++ b/test/finufft3dmany_test.cpp
@@ -103,7 +103,7 @@ int main(int argc, char *argv[]) {
          (long long)nt2, (long long)nt3, i, err);
 
   // compare the result with FINUFFT3D1
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   finufft_opts simpleopts = opts;
   simpleopts.debug        = 0; // don't output timing for calls of FINUFFT3D1
   simpleopts.spread_debug = 0;
@@ -142,7 +142,7 @@ int main(int argc, char *argv[]) {
 #pragma omp for schedule(static, TEST_RANDCHUNK)
     for (BIGINT m = 0; m < N * ntransf; ++m) F[m] = crandm11r(&se);
   }
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   timer.restart();
   ier = FINUFFT3D2MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts);
   ti  = timer.elapsedsec();
@@ -171,7 +171,7 @@ int main(int argc, char *argv[]) {
   errmax = max(err, errmax);
   printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err);
 
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   // compare the result with FINUFFT3D2...
   CPX *c_3d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf);
   timer.restart();
@@ -198,7 +198,7 @@ int main(int argc, char *argv[]) {
   free(c_3d2);
 
   printf("test 3d3 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
@@ -250,7 +250,7 @@ int main(int argc, char *argv[]) {
   errmax = max(err, errmax);
   printf("\t one targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
 
-  FFTW_FORGET_WISDOM();
+  finufft_fft_forget_wisdom();
   // compare the result with FINUFFT3D3...
   CPX *f_3d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();