diff --git a/CHANGELOG b/CHANGELOG
index cf842bb67..ac3307c4e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,22 +1,25 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-V 2.3.0beta (7/24/24)
+V 2.3.0-rc1 (8/6/24)
 
-* python build modernized to pyproject.toml (both CPU and GPU).
-  PRs 507 (Anden, Lu, Barbone)
-* switchable FFT: either FFTW or DUCC0 (latter need no plan stage; also it is
+* Switched C++ standards from C++14 to C++17, allowing various templating
+  improvements (Barbone).
+* Python build modernized to pyproject.toml (for both CPU and GPU).
+  PR 507 (Anden, Lu, Barbone). Compiles from source for the local build.
+* Switchable FFT: either FFTW or DUCC0 (latter needs no plan stage; also it is
   used to exploit sparsity pattern to achieve FFT speedups 1-3x in 2D and 3D).
-  PR463, Martin Reinecke.
+  PR463, Martin Reinecke. Both CMake and makefile include this DUCC0 option
+  (makefile PR511 by Barnett; CMake by Barbone).
 * ES kernel rescaled to max value 1, reduced poly degrees for upsampfac=1.25,
   cleaner Horner coefficient generation PR499 (fixes fp32 overflow issue #454).
 * Major manual acceleration of spread/interp kernels via XSIMD header-only lib,
   kernel evaluation, templating by ns with AVX-width-dependent decisions.
   Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu).
-  PRs 459, 471, 502.
-  NOTE: introduces new dependency (XSIMD), added to cMake and makefile.
+  A large chunk of work: PRs 459, 471, 502.
+  NOTE: introduces new dependency (XSIMD), added to CMake and makefile.
 * Exploiting even/odd symmetry for 10% faster xsimd-accel kernel poly eval
-  Libin Lu based on idea of Martin Reinecke (PR477,492,493).
+  (Libin Lu based on idea of Martin Reinecke; PR477,492,493).
 * new test/finufft3dkernel_test checks kerevalmeth=0 and 1 agree to tolerance
   PR 473 (M Barbone).
 * new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
@@ -47,24 +50,24 @@ V 2.3.0beta (7/24/24)
   any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that
   internally, 32-bit integers are still used, so calling cufinufft with more
   than 2e9 points will fail. This restriction may be lifted in the future.
-* cmake build system revamped completely, more modern practices.
-  It auto selects compiler flags based on the supported ones on all operating systems.
-  Added support for Windows (llvm, msvc), Linux (llvm, gcc) and MacOS (llvm, gcc).
-* cmake support for both ducc0 and fftw
-* cmake adding nvcc and msvc optimization flags
-* cmake supports sphinx
-* updated install docs
-* cuFINUFFT binsize is now a function of the shared memory available where
-  possible.
-* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
-* cuFINUFFT using the new normalized Horner coefficients and added support
-  for 1.25.
-* cuFINUFFT new compile flags for extra-vectorization, flushing single
-  precision denormals to 0 and using fma where possible.
-* cuFINUFFT using intrinsics in foldrescale and other places to increase
-  performance
-* cuFINUFFT using SM90 float2 vector atomicAdd where supported
-* cuFINUFFT making default binsize = 0
+* CMake build system revamped completely, using more modern practices (Barbone).
+  It now auto-selects compiler flags based on those supported on all OSes, and
+  has support for Windows (llvm, msvc), Linux (llvm, gcc) and MacOS (llvm, gcc).
+* CMake added nvcc and msvc optimization flags.
+* sphinx local doc build also using CMake. (Barbone)
+* updated install docs, including for DUCC0 FFT and new python build
+  (Barnett).
+* Major acceleration effort for the GPU library cufinufft (M Barbone, PR488):
+  - binsize is now a function of the shared memory available where possible.
+  - GM 1D sorts using thrust::sort instead of bin-sort.
+  - uses the new normalized Horner coefficients and added support for
+    upsampfac=1.25 on GPU, for first time.
+  - new compile flags for extra-vectorization, flushing single
+    precision denormals to 0 and using fma where possible.
+  - using intrinsics (eg FMA) in foldrescale and other places to increase
+    performance
+  - using SM90 float2 vector atomicAdd where supported
+  - make default binsize = 0
 
 V 2.2.0 (12/12/23)
 
diff --git a/cmake/setupCPM.cmake b/cmake/setupCPM.cmake
index b82bd82b7..610f1572b 100644
--- a/cmake/setupCPM.cmake
+++ b/cmake/setupCPM.cmake
@@ -1,18 +1,36 @@
 # USING CPM TO HANDLE DEPENDENCIES
+# Where the CPM.cmake bootstrap script is stored: the CPM_SOURCE_CACHE CMake
+# variable takes priority, then the environment variable of the same name,
+# and otherwise the script lives in the build tree.
 if(CPM_SOURCE_CACHE)
-    set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+  set(CPM_DOWNLOAD_LOCATION
+      "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 elseif(DEFINED ENV{CPM_SOURCE_CACHE})
-    set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+  set(CPM_DOWNLOAD_LOCATION
+      "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 else()
-    set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+  set(CPM_DOWNLOAD_LOCATION
+      "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 endif()
 
 if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
-    message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
-    file(DOWNLOAD
-        https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
-        ${CPM_DOWNLOAD_LOCATION}
-    )
+  # Not on disk yet: fetch the release selected by CPM_DOWNLOAD_VERSION.
+  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
+  file(
+    DOWNLOAD
+    https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+    ${CPM_DOWNLOAD_LOCATION}
+    STATUS cpm_download_status)
+  # STATUS yields a two-element list: numeric code (0 = success) and message.
+  list(GET cpm_download_status 0 cpm_download_code)
+  if(NOT cpm_download_code EQUAL 0)
+    # Remove whatever was written, so the EXISTS test above does not skip the
+    # download on the next configure run and include() a broken script.
+    file(REMOVE "${CPM_DOWNLOAD_LOCATION}")
+    message(
+      FATAL_ERROR
+        "Downloading CPM.cmake failed with status: ${cpm_download_status}")
+  endif()
 endif()
 
 include(${CPM_DOWNLOAD_LOCATION})
diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake
index 6e1fd641d..5f31ae61e 100644
--- a/cmake/setupXSIMD.cmake
+++ b/cmake/setupXSIMD.cmake
@@ -1,20 +1,28 @@
-CPMAddPackage(
-        NAME xtl
-        GIT_REPOSITORY "https://github.com/xtensor-stack/xtl.git"
-        GIT_TAG ${XTL_VERSION}
-        EXCLUDE_FROM_ALL YES
-        GIT_SHALLOW YES
-        OPTIONS "XTL_DISABLE_EXCEPTIONS YES"
-)
-
-CPMAddPackage(
-        NAME xsimd
-        GIT_REPOSITORY "https://github.com/xtensor-stack/xsimd.git"
-        GIT_TAG ${XSIMD_VERSION}
-        EXCLUDE_FROM_ALL YES
-        GIT_SHALLOW YES
-        OPTIONS
-            "XSIMD_SKIP_INSTALL YES"
-            "XSIMD_ENABLE_XTL_COMPLEX YES"
-)
+# xtl dependency, downloaded by CPM at the git tag held in XTL_VERSION.
+# XTL_VERSION is not defined in this file; presumably the including
+# CMakeLists.txt sets it -- confirm there.
+# Each keyword shares a line with its value, and the command is spelled
+# CPMAddPackage as in the CPM documentation (CMake command names are
+# case-insensitive, so this is the same call as cpmaddpackage).
+CPMAddPackage(
+  NAME xtl
+  GIT_REPOSITORY "https://github.com/xtensor-stack/xtl.git"
+  GIT_TAG ${XTL_VERSION}
+  EXCLUDE_FROM_ALL YES
+  GIT_SHALLOW YES
+  OPTIONS "XTL_DISABLE_EXCEPTIONS YES")
 
+# xsimd dependency, downloaded by CPM at the git tag held in XSIMD_VERSION
+# (not defined in this file; presumably set by the including CMakeLists.txt
+# -- confirm there). Each OPTIONS entry is a "NAME VALUE" string that is
+# handed to the xsimd project: going by the names, one skips its install
+# step and the other switches on its xtl complex-number support.
+CPMAddPackage(
+  NAME xsimd
+  GIT_REPOSITORY "https://github.com/xtensor-stack/xsimd.git"
+  GIT_TAG ${XSIMD_VERSION}
+  EXCLUDE_FROM_ALL YES
+  GIT_SHALLOW YES
+  OPTIONS
+  "XSIMD_SKIP_INSTALL YES"
+  "XSIMD_ENABLE_XTL_COMPLEX YES")
diff --git a/docs/devnotes.rst b/docs/devnotes.rst
index 61e79cca6..bf008d85a 100644
--- a/docs/devnotes.rst
+++ b/docs/devnotes.rst
@@ -27,11 +27,11 @@ Developer notes
 
 * The kernel function in spreadinterp is evaluated via piecewise-polynomial approximation (Horner's rule). The code for this is auto-generated in MATLAB, for all upsampling factors. There are two versions supported:
 
-  - 2018--2024 vintage: no explicit SIMD vectorization, C code is generated code for the Horner evaluation loop, by running from MATLAB `gen_all_horner_C_code.m`
+  - 2018--2024 vintage: no explicit SIMD vectorization, C code is generated code for the Horner evaluation loop, by running from MATLAB ``gen_all_horner_C_code.m``
 
-  - post-2024 vintage: explicit SIMD and many other acceleration tricks, and the generated code is a static C++ array of coefficients, and their sizes (`nc` or number of coefficients) for each width `w`. Run from MATLAB `gen_ker_horner_loop_cpp_code.m`
+  - post-2024 vintage: explicit SIMD and many other acceleration tricks, and the generated code is a static C++ array of coefficients, and their sizes (``nc`` or number of coefficients) for each width ``w``. Run from MATLAB ``gen_ker_horner_loop_cpp_code.m``
 
-  See `devel/README` for more details. The ES kernel coefficient and poly approx degree for both of the above are defined in a single location, `devel/get_degree_and_beta.m`, which must match the C++ `setup_spreader()` function.
+  See ``devel/README`` for more details. The ES kernel coefficient and poly approx degree for both of the above are defined in a single location, ``devel/get_degree_and_beta.m``, which must match the C++ ``setup_spreader()`` function.
 
 * Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test build the variant OMP=OFF. The dev should test these locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``JenkinsFile`` for the GPU CI via python wrappers.
 
@@ -49,7 +49,9 @@ Developer notes
 
 * The cufinufft Python wheels are generated using Docker based on the manylinux2014 image. For instructions, see ``tools/cufinufft/distribution_helper.sh``. These are binary wheels that are built using CUDA 11 (or optionally CUDA 12, but these are not distributed on PyPI) and bundled with the necessary libraries.
 
-* Testing cufinufft (for FI, mostly)
+* CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
+
+* Testing cufinufft (for FI, mostly):
 
 .. code-block:: sh
 
diff --git a/docs/install.rst b/docs/install.rst
index 22fa8e730..ca8b38dde 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -3,36 +3,49 @@
 Installation
 ============
 
-There are two main ways to compile this library from source:
-via CMake (the recommended modern way, being more platform-independent),
+There are two main routes to compile the CPU library from source:
+via CMake (the recommended modern way, being more platform-independent, and also the
+only way to build the GPU library),
 or via a GNU ``makefile`` (which has various settings for linux, OSX, Windows).
 We currently support both, and detail them in that order in the text below.
+The only requirement is a C/C++ compiler supporting OpenMP and the C++17
+standard.
+FINUFFT builds with no issues on Linux and MacOS using any compiler, and in our experience (as of 2024), GCC13 gives the best performance. We do not recommend any GCC version prior to 9, due to vectorization issues.
+
+.. note::
+  There are now two choices of FFT library for the CPU build:
+
+    * `FFTW3 <https://www.fftw.org>`_ (its single- and double-precision libraries must then already be installed), or
+    * `DUCC0 FFT <https://gitlab.mpcdf.mpg.de/mtr/ducc>`_ (which is automatically installed into the ``deps`` subdirectory by CMake or GNU make).
+
+  Both are available in either CMake or GNU make build routes. Currently FFTW3 is the default in both routes, since DUCC0 is new as of FINUFFT v2.3 and not as well tested. DUCC0 is from the same author as `PocketFFT <https://gitlab.mpcdf.mpg.de/mtr/pocketfft>`_ (used, for instance, by `scipy <https://scipy.org/>`_); however, DUCC0 FFT is more optimized than PocketFFT. Choosing DUCC0 also exploits the block-sparsity structure in 2D and 3D transforms, and is generally faster than FFTW3 in those cases. In 1D, the relative speed of FFTW3 and DUCC0 varies depending on ``N`` and the batch size. DUCC0 has no plan stage, whereas FFTW3 requires a plan stage. Some idea of their relative performance can be found in `this discussion <https://github.com/flatironinstitute/finufft/pull/463#issuecomment-2223988300>`_. We encourage the power user to try switching to DUCC0 to see if it is faster in their setting.
+
 If you cannot get FINUFFT to compile, as a last resort you might find
 a precompiled binary for your platform under Assets for various
 `releases <https://github.com/flatironinstitute/finufft/releases>`_.
 Please post an `Issue <https://github.com/flatironinstitute/finufft/issues>`_
 to document your installation problem.
-When using CMake, finufft requires no external dependencies except (c/c++ compilers supporting OpenMP and c++17).
-However GNU ``makefile`` assumes that FFTW (single/double) are installed.
-Python-only users can simply install via ``pip install finufft`` which downloads a generic binary from PyPI. Only if you prefer a custom compilation, see :ref:`below<install-python>`.
+
+Python-only users can simply install via ``pip install finufft`` which downloads a generic binary from PyPI. If you prefer a local Python package build, see :ref:`below<install-python>`.
 
 .. note::
-    finufft builds with no issues on Linux and MacOS using any compiler, in our experience GCC-13 gives best performance.
-    On Windows MSVC works fine. The llvm toolchain included in Visual Studio does not seem to have OpenMP, it is possible to build single-threaded FINUFFT.
-    The official windows LLVM distribution builds finufft with no issues but debug builds using sanitizers break.
-    On windows finufft built with MSVC requires ``VCOMP140D.DLL`` which is part of the `Microsoft Visual C++ Redistributable <https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170>`_.
+    Here are some overall notes about Windows. On Windows, MSVC works fine. However, the LLVM toolchain included in Visual Studio does not seem to have OpenMP, but it is still possible to build single-threaded FINUFFT.
+    The official windows LLVM distribution builds FINUFFT with no issues, but debug builds using sanitizers break.
+    On Windows with MSVC, FINUFFT also requires ``VCOMP140D.DLL`` which is part of the `Microsoft Visual C++ Redistributable <https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170>`_.
     It is likely to be already installed in your system.
-    If the library is built with LLVM it requires ``libomp140.x86.64.dll``, more information `here <https://devblogs.microsoft.com/cppblog/improved-openmp-support-for-cpp-in-visual-studio/>`_.
+    If the library is built on Windows with LLVM, it requires ``libomp140.x86.64.dll``; see `here <https://devblogs.microsoft.com/cppblog/improved-openmp-support-for-cpp-in-visual-studio/>`_.
 
 
-CMake CPM Based Installation
-----------------------------
+Including FINUFFT into your own CMake project
+---------------------------------------------
 
-This is the easiest way to install ``finufft`` if you are using CMake in your own project.
-First include `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ to your project.
-
-The easiest way is to follow the `instructions <https://github.com/cpm-cmake/CPM.cmake/wiki/Downloading-CPM.cmake-in-CMake>`_ to automatically add CPM to cmake.
+This is the easiest way to install and use FINUFFT if you already use
+CMake in your own project, since CMake automates all aspects of
+installation and compilation.
+There are two options: CPM or FetchContent.
+We recommend the first.
 
+1) **CPM**. First add `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ to your project, by following the `instructions <https://github.com/cpm-cmake/CPM.cmake/wiki/Downloading-CPM.cmake-in-CMake>`_ to automatically add CPM to CMake.
 Then add the following to your ``CMakeLists.txt``:
 
 .. code-block:: cmake
@@ -53,14 +66,10 @@ Then add the following to your ``CMakeLists.txt``:
 
   target_link_library(your_executable [PUBLIC|PRIVATE|INTERFACE] finufft)
 
-Then cmake will automatically download the library and link it to your executable.
-
-CMake FetchContent Based Installation
-----------------------------
+Then CMake will automatically download FINUFFT and link it to your executable.
 
-Another way to include finufft in the project is to use FetchContent
-which is provided directly by cmake.
-To do so add the following to your ``CMakeLists.txt``:
+2) **FetchContent**: This tool is provided directly by CMake.
+Add the following to your ``CMakeLists.txt``:
 
 .. code-block:: cmake
 
@@ -79,14 +88,14 @@ To do so add the following to your ``CMakeLists.txt``:
     # Optionally, link the finufft library to your target
     target_link_libraries(your_executable [PUBLIC|PRIVATE|INTERFACE] finufft)
 
-Then cmake will automatically download the library and link it to your executable.
+Then CMake will automatically download FINUFFT and link it to your executable.
 
-CMake Based Installation
-------------------------
+CMake based installation and compilation
+----------------------------------------
 
-These instructions are in draft form.
 Make sure you have ``cmake`` version at least 3.19.
-The basic quick download, building, and test is then:
+The basic quick download, default building, and test and install
+is then done by:
 
 .. code-block:: bash
 
@@ -94,19 +103,19 @@ The basic quick download, building, and test is then:
   cd finufft
   cmake -S . -B build -DFINUFFT_BUILD_TESTS=ON --install-prefix /path/to/install
   cmake --build build
-  ctest --test-dir build/
+  ctest --test-dir build
   cmake --install build
 
+In ``build``, this creates the static library (``libfinufft.a`` on linux or OSX), and runs a test that should take a
+couple of seconds and report ``100% tests passed, 0 tests failed out of 17``. It then attempts to install the library.
+To instead build a shared library, see the ``FINUFFT_STATIC_LINKING`` CMake option below.
+
 .. note::
 
-   If you don't supply `--install-prefix`, it will default to ``/usr/local`` on most systems. If you don't have root access, you must supply a prefix you can write to such as ``$HOME/local``.
+   The use of ``--install-prefix`` and the final install command are optional, if the user is happy working with the static library in ``build``. If you don't supply ``--install-prefix``, it will default to ``/usr/local`` on most systems. If you don't have root access for your install directory, it will complain. If you supply a prefix, make sure it is one you can write to, such as ``$HOME/local``.
 
-In ``build``, this creates ``libfinufft.a`` or ``libfinufft.so``, and runs a test that should take a
-few seconds and report ``100% tests passed, 0 tests failed out of 17``.  To use the library, link against
-either the static or dynamic library in ``build`` or your installed version
-(i.e. ``/path/to/install/lib64/libfinufft.so`` or ``/path/to/install/lib/libfinufft.so``). If you install
-anywhere other than standard system wide locations (``/usr/local``), building/linking requires you specify the
-location of the library. If you link the shared library, you should also tell your compiled binary to store
+To use the library, link against either the static or dynamic library in ``build``, or your installed version
+(i.e. ``/path/to/install/lib64/libfinufft.so`` or ``/path/to/install/lib/libfinufft.so``). If you link to the shared library, you should also tell your compiled binary to store
 the location of that library in its ``RPATH``. Let's say you installed with the prefix ``$HOME/local``, your
 system prefers the ``lib64`` library directory, and you're still in the build directory. Then...
 
@@ -115,45 +124,43 @@ system prefers the ``lib64`` library directory, and you're still in the build di
   g++ -o simple1d1 ../examples/simple1d1.cpp -I$HOME/local/include -L$HOME/local/lib64 -Wl,-rpath $HOME/local/lib64 -lfinufft -O2
 
 
-will manually build the ``simple1d1`` example and drop it in the current directory.
+will manually build the executable for our ``simple1d1`` example, and drop it in the current directory.
 
-Here are all our build options, showing name, explanatory text, and default value, straight from the ``CMakeLists.txt`` file:
+Here are our CMake build options, showing name, explanatory text, and default value, straight from the ``CMakeLists.txt`` file:
 
 .. literalinclude:: ../CMakeLists.txt
    :language: cmake
    :start-after: @cmake_opts_start
    :end-before: @cmake_opts_end
 
-.. note::
-    It is possible to choose between ``FFTW`` and `ducc fft <https://gitlab.mpcdf.mpg.de/mtr/ducc>`_, ducc fft is from the same author as `pocket fft <https://gitlab.mpcdf.mpg.de/mtr/pocketfft>`_.
-    Pocket fft is the fft used by `scipy <https://scipy.org/>`_.
-    An idea about ducc performance can be found in `this discussion <https://github.com/flatironinstitute/finufft/pull/463#issuecomment-2223988300>`_. We encourage the power user to try switching to ducc to see if it improves performance.
-
-.. warning::
-    Note to the user, using --fast-math or /fp:fast can break finufft and its tests.
-    On windows with msvc cl, ``ducc fft`` has to compile with ``/fp:fast``, otherwise some tests (run_finufft3d_test_float, run_finufft3dmany_test_float) may fail because of the resulting error is larger than the tolerance.
-    On the other hand, finufft on windows with msvc cl should not compile with flag ``/fp:fast``, with ``/fp:fast`` the test run_dumbinputs_double will result in segfault, because /fp:fast makes values (NaN, +infinity, -infinity, -0.0) may not be propagated or behave strictly according to the IEEE-754 standard.
-
 For convenience we also provide a number of `cmake presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_
 for various options and compilers, in ``CMakePresets.json`` (this will grow to replace the old ``make.inc.*`` site files).
 For example, to configure, build and test the development preset (which builds tests and examples), from ``build`` do:
 
 .. code-block:: bash
 
-  cmake -S . -B build --preset dev # dev is the preset name
+  cmake -S . -B build --preset dev            # dev is the name of the preset
   cmake --build build
-  ctest --test-dir build/
+  ctest --test-dir build
+
+From other CMake projects, to use ``finufft`` as a library after building as above, simply add this repository as a subdirectory using
+``add_subdirectory``, and use ``target_link_libraries(your_executable finufft)``.
+
+Notes on compiler flags for various systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These apply to CMake (as above), or GNU make (as below).
 
 .. warning::
+    Using ``--fast-math`` or ``/fp:fast`` can break FINUFFT and its tests.
+    On Windows with msvc cl, ``DUCC0 FFT`` has to be compiled with ``/fp:fast``; otherwise some tests (run_finufft3d_test_float, run_finufft3dmany_test_float) may fail because the resulting error is larger than the tolerance.
+    On the other hand, FINUFFT itself on Windows with msvc cl should not be compiled with the flag ``/fp:fast``: with it, the test run_dumbinputs_double segfaults, because under ``/fp:fast`` special values (NaN, +infinity, -infinity, -0.0) may not be propagated or behave strictly according to the IEEE-754 standard.
 
-  Intel compilers (unlike GPU compilers) currently engage ``fastmath`` behavior with ``-O2`` or ``-O3``. This may interfere with our use of ``std::isfinite`` in our test codes. For this reason in the Intel presets ``icx`` and ``icc`` have set ``-fp-model=strict``. You may get more speed if you remove this flag, or try ``-fno-finite-math-only``.
+.. warning::
 
-From other CMake projects, to use ``finufft`` as a library, simply add this repository as a subdirectory using
-``add_subdirectory``, and use ``target_link_library(your_executable finufft)``.
+  Intel compilers (unlike GPU compilers) currently engage ``fastmath`` behavior with ``-O2`` or ``-O3``. This may interfere with our use of ``std::isfinite`` in our source and test codes. For this reason the Intel presets ``icx`` and ``icc`` set ``-fp-model=strict``. You may get more speed if you remove this flag, or try ``-fno-finite-math-only``.
 
-.. note::
 
-   CMake compiling on linux at Flatiron Institute (Rusty cluster). We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
 
 
 Classic GNU make based route
@@ -168,61 +175,73 @@ PowerPC. The general procedure to download, then compile for such a special setu
   cp make.inc.powerpc make.inc
   make test -j
 
-Have a look for ``make.inc.*`` to see what is available, and/or edit your ``make.inc`` based on looking in the ``makefile`` and quirks of your local setup. As of 2021, we have continuous integration which tests the default (linux) settings in this ``makefile``, plus those in three OS-specific setup files::
+Have a look for ``make.inc.*`` to see what is available, and/or edit your ``make.inc`` based on looking in the ``makefile`` and quirks of your local setup. We have continuous integration which tests the default (linux) settings in this ``makefile``, plus those in three OS-specific setup files, currently::
 
   make.inc.macosx_clang
-  make.inc.macosx_gcc-10
+  make.inc.macosx_gcc-12
   make.inc.windows_msys
 
+Thus, those are the recommended files for OSX or Windows users to try as their ``make.inc``.
 If there is an error in testing on what you consider a standard set-up,
 please file a detailed bug report as a New Issue at https://github.com/flatironinstitute/finufft/issues
 
-
 Quick linux GNU make install instructions
---------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Make sure you have packages ``fftw3`` and ``fftw3-dev`` (or their
-equivalent on your distro) installed.
+Unless you select ``FFT=DUCC``, make sure you have packages ``fftw3`` and ``fftw3-dev`` (or their equivalent on your distro) installed.
 Then ``cd`` into your FINUFFT directory and do ``make test -j``.
 This should compile the static
 library in ``lib-static/``, some C++ test drivers in ``test/``, then run them,
 printing some terminal output ending in::
 
-  0 segfaults out of 8 tests done
-  0 fails out of 8 tests done
+  0 segfaults out of 9 tests done
+  0 fails out of 9 tests done
 
 This output repeats for double then single precision (hence, scroll up to check the double also gave no fails).
-If this fails, see the more detailed instructions below.
+If this fails, see the more detailed instructions/tips below.
 If it succeeds,
 please look in ``examples/``, ``test/``, and the rest of this manual,
 for examples of how to call and link to the library.
-Type ``make`` to see a list of other aspects the user can build
-(examples, language interfaces, etc).
+
+
+Make build tasks and options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here are the GNU make tasks and options, taken from the current ``makefile`` output:
+
+.. literalinclude:: makefile.doc
+
+The variables ``OMP`` and ``FFT`` need to be used consistently for downstream make tasks
+(e.g: ``make test -j && make examples FFT=DUCC`` will fail).
+They can instead be set in your ``make.inc``.
+A ``make objclean`` (eg, via ``make clean``) is needed before changing use of such variables.
+As usual, user environment variables are also visible to GNU make.
+
 
 Dependencies
-------------
+~~~~~~~~~~~~
 
 This library is fully supported for unix/linux, and partially for
 Mac OSX for Windows (eg under MSYS or WSL using MinGW compilers).
 
-For the basic libraries you need
+For the basic libraries you must have:
 
-* C++ compiler supporting C++14, such ``g++`` in GCC (version >=5.0), or ``clang`` (version >=3.4)
-* FFTW3 (version at least 3.3.6) including its development libraries
+* C++ compiler supporting C++17, such as ``g++`` in GCC, or ``clang`` (version >=5)
 * GNU ``make`` and other standard unix/POSIX tools such as ``bash``
 
-Optional:
+Optionally you need:
 
+* By default (unless ``FFT=DUCC``) FFTW3 (version at least 3.3.6) including its development libraries
 * for Fortran wrappers: compiler such as ``gfortran`` in GCC
 * for MATLAB wrappers: MATLAB (versions at least R2016b up to current work)
 * for Octave wrappers: recent Octave version at least 4.4, and its development libraries
-* for the python wrappers you will need ``python`` version at least 3.6 (python 2 is unsupported), with ``numpy``.
+* for the python wrappers you will need ``python`` version at least 3.8 (python 2 is unsupported), with ``numpy``.
 
 
 1) Linux: tips for installing dependencies and compiling
 -------------------------------------------------------------------
 
-On a Fedora/CentOS linux system, the base dependencies can be installed by::
+On a Fedora/CentOS linux system, the base dependencies (including optional FFTW3) can be installed by::
 
   sudo yum install make gcc gcc-c++ fftw-devel libgomp
 
@@ -244,34 +263,12 @@ and for Fortran, Python, and Octave language interfaces also do::
 
 In older distros you may have to compile ``octave`` from source to get the needed >=4.4 version.
 
-You should then compile and test the library via various ``make`` tasks, eg::
-
-  make test -j
-
-then checking you got ``0 fails``.
-This compiles the main libraries then runs double- and single-precision tests, each of which should report zero segfaults and zero fails.
-
-.. note::
-
-   GCC versions on linux: long-term linux distros ship old GCC versions
-   that may not be C++17 compatible. We recommend that you
-   compile with a recent GCC, at least GCC 7.3 (which we used
-   for benchmarks in 2018 in our SISC paper), or GCC 9+. We do not recommend
-   GCC versions prior to 7. We also **do not recommend GCC8** since
-   its auto vectorization has worsened, and its kernel evaluation rate
-   using the default looped piecewise-polynomial Horner code drops to
-   less than 150 Meval/s/core on an i7. This contrasts 400-700
-   Meval/s/core achievable with GCC7 or GCC9 on i7. If you wish to
-   test these raw kernel evaluation rates, do into ``devel/``, compile
-   ``test_ker_ppval.cpp`` and run ``fig_speed_ker_ppval.m`` in MATLAB. We are
-   unsure if GCC8 is so poor in Mac OSX (see below).
-
+You should then compile and test the library via various ``make`` tasks, as discussed above.
 The make tasks (eg ``make lib``) compiles double and single precision functions,
 which live simultaneously in ``libfinufft``, with distinct function names.
 
-The only selectable option at compile time is
-multithreaded (default, using OpenMP) vs single-threaded
-(to achieve this append ``OMP=OFF`` to the make tasks).
+The make variable ``OMP=OFF`` builds a single-threaded library without
+reference to OpenMP.
 Since you may always set ``opts.nthreads=1`` when calling the multithreaded
 library, the point of having a single-threaded library is
 mostly for small repeated problems to avoid *any* OpenMP overhead, or
@@ -293,21 +290,17 @@ Since these call many tiny problem sizes, they will (due to openmp and fftw thre
 run much faster with less than the full thread count, explaining our use of 4 threads.
 Text (and stderr) outputs are written into ``test/results/*.out``.
 
-Use ``make perftest`` for larger spread/interpolation and NUFFT tests taking 10-20 seconds. This writes log files into ``test/results/`` where you will be able to compare to results from standard CPUs.
-
-Run ``make`` without arguments for full list of possible make tasks.
-
-``make examples`` to compile and run the examples for calling from C++ and from C.
+Use ``make perftest`` for larger spread/interpolation and NUFFT tests taking 30-60 seconds. This writes log files into ``test/results/``.
 
-``make fortran`` to compile and run the fortran wrappers and examples.
+Run ``make`` without arguments for full list of possible make tasks (see above).
 
 **High-level interfaces**.
 See :ref:`below<install-python>` for python compilation.
 
 ``make matlab`` to compile the MEX interface to matlab,
 then within MATLAB add the ``matlab`` directory to your path,
-cd to ``matlab/test`` and run ``check_finufft`` which should run for 5 secs
-and print a bunch of errors around ``1e-6``.
+cd to ``matlab/test`` and run ``check_finufft`` which should run for 3 secs
+and print a bunch of errors of typical size ``1e-6``.
 
 .. note::
 
@@ -323,31 +316,12 @@ and print a bunch of errors around ``1e-6``.
 
 
 
-Compilation flags and make.inc settings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This is for experts.
-Here are all the flags that the FINUFFT source responds to.
-Activate them by adding a line of the form ``CXXFLAGS+=-DMYFLAG`` in your ``make.inc``:
-
-* ``-DSINGLE``: This is internally used by our build process to switch
-  (via preprocessor macros) the source from double to single precision.
-  You should not need to use this flag yourself.
-
-Here are some other settings that you may need to adjust in ``make.inc``:
-
-* Switching to linking tests, examples, etc, with PTHREADS instead of the default OMP version of FFTW, is achieved by inserting into ``make.inc`` the line ``FFTWOMPSUFFIX = threads``.
-
-
-
-
-
 2) Mac OSX: tips for installing dependencies and compiling
 -----------------------------------------------------------
 
 .. note::
 
-   A brew package will come shortly; stay tuned. However, the below has been tested on 10.14 (Mojave) with both clang and gcc-8, and 10.15 (Catalina) with clang.
+   The below has been tested on 10.14 (Mojave) with both clang and gcc-8, and 10.15 (Catalina) with clang. The notes are a couple of years out of date (as of 2024).
 
 First you'll want to set up Homebrew, as follows. We assume a fresh OSX machine.
 If you don't have Xcode, install Command Line Tools
@@ -407,7 +381,7 @@ Whichever you picked, now try ``make test -j``, and clang should compile and you
 you should now ``make matlab``. You may need to do ``make matlab -j``; see
 https://github.com/flatironinstitute/finufft/issues/157 which needs attention.
 To test, open MATLAB, ``addpath matlab``,
-``cd matlab/test``, and ``check_finufft``, which should complete in around 5 seconds.
+``cd matlab/test``, and ``check_finufft``, which should complete in around 3 seconds.
 
 .. note::
 
@@ -431,7 +405,7 @@ appears to be essential. The basic idea is::
   make fortran
 
 which also compiles and tests the fortran interfaces.
-You may need to edit to ``g++-11``, or whatever your GCC version is,
+You may need to edit to ``g++-13``, or whatever your GCC version is,
 in your ``make.inc``.
 
 .. note::
@@ -460,9 +434,9 @@ section of ``mexopts.sh``.
 
 
 3) Windows GNU make: tips for compiling
--------------------------------
+------------------------------------------
 
-We have users who have adjusted the makefile to work - at least to some extent - on Windows 10. If you are only interested in calling from Octave (which already comes with MinGW-w64 and FFTW), then we have been told this can be done very simply: from within Octave, go to the ``finufft`` directory and do ``system('make octave')``. You may have to tweak ``OCTAVE`` in your ``make.inc`` in a similar fashion to below.
+We have users who have adjusted the makefile to work - at least to some extent - on Windows 10. We suggest switching to the above CMake route instead for Windows, since we will not invest much effort supporting the ``makefile`` for Windows. If you are only interested in calling from Octave (which already comes with MinGW-w64 and FFTW), then we have been told this can be done very simply: from within Octave, go to the ``finufft`` directory and do ``system('make octave')``. You may have to tweak ``OCTAVE`` in your ``make.inc`` in a similar fashion to below.
 
 More generally, please make sure to have a recent version of Mingw at hand, preferably with a 64bit version of gnu-make like the WinLibs standalone build of GCC and MinGW-w64 for Windows. Note that most MinGW-w64 distributions, such as TDM-GCC, do not feature the 64bit gnu-make. Fortunately, this limitation is only relevant to run the tests. To prepare the build of the static and dynamic libraries run::
 
@@ -480,21 +454,21 @@ In a similar fashion, the examples can now be build with ``make examples``. This
 
   make matlab
 
-For users who work with Windows using MSYS and MinGW compilers. Please
+For users who work with Windows using MSYS and MinGW compilers, please
 try::
 
   cp make.inc.windows_msys make.inc
   make test -j
 
-We seek help with Windows support. Also see https://github.com/flatironinstitute/finufft/issues
+Also see https://github.com/flatironinstitute/finufft/issues
 
 
 
 
 .. _install-python:
 
-Building a python interface to a locally compiled library
------------------------------------------------------------------------
+Building a Python interface to a locally compiled library
+---------------------------------------------------------
 
 Recall that the basic user may simply ``pip install finufft``,
 then check it worked via either (if you have ``pytest`` installed)::
@@ -507,32 +481,20 @@ or the older-style eyeball check with::
 
 which should report errors around ``1e-6`` and throughputs around 1-10 million points/sec.
 
-However, a user or developer may want to build a python wrapper to their locally
-compiled FINUFFT library, perhaps for more speed. We now describe this,
-for all OSes.
-We assume ``python3`` (hence ``pip3``; make sure you have that installed).
-
-First, compile the shared C++ library, via, eg ``make lib -j`` (using the old-style ``makefile``),
-or via::
-
-  make -p build
-  cd build
-  cmake ..
-  cmake --build . -j
-  cd ..
-
-You may then run::
+However, better performance will result from locally compiling the library on your CPU into a Python module. This can better exploit your CPU's capabilities than the ``pypi`` distribution that ``pip install finufft`` downloads.
+We assume ``python`` (hence ``pip``; make sure you have that installed), at least version 3.8. We now use the modern ``pyproject.toml`` build system,
+which locally compiles with cmake (giving you native performance on your CPU).
+For this, run::
 
-  pip3 install -e python/finufft
+  pip install python/finufft
 
-which builds the ``finufft`` Python module, linking to the ``.so``,
-and installs (in editable mode) via pip.
-You will see that the ``finufftc.*.so`` shared object appears in the ``python/finufft/finufft/`` directory.
+which compiles the library from source then installs the Python module.
+If you see a complaint about missing ``setup.py``, you need a more recent version of pip/python.
 You should then run the above tests. You could also run tests and examples via ``make python``.
 
 An additional performance test you could then do is::
 
-  python3 python/finufft/test/run_speed_tests.py
+  python python/finufft/test/run_speed_tests.py
 
 .. note::
 
diff --git a/docs/makefile.doc b/docs/makefile.doc
new file mode 100644
index 000000000..75684bde0
--- /dev/null
+++ b/docs/makefile.doc
@@ -0,0 +1,24 @@
+Makefile for FINUFFT library. Please specify your task:
+ make lib - build the main library (in lib/ and lib-static/)
+ make examples - compile and run all codes in examples/
+ make test - compile and run quick math validation tests
+ make perftest - compile and run (slower) performance tests
+ make fortran - compile and run Fortran tests and examples
+ make matlab - compile MATLAB interfaces (no test)
+ make octave - compile and test octave interfaces
+ make python - compile and test python interfaces
+ make all - do all the above (around 1 minute; assumes you have MATLAB, etc)
+ make spreadtest - compile & run spreader-only tests (no FFT)
+ make spreadtestall - small set spreader-only tests for CI use
+ make objclean - remove all object files, preserving libs & MEX
+ make clean - also remove all lib, MEX, py, and demo executables
+ make setup - check (and possibly download) dependencies
+ make setupclean - delete downloaded dependencies
+For faster (multicore) compilation, append, for example, -j8
+
+Make options:
+ 'make [task] OMP=OFF' for single-threaded (no refs to OpenMP)
+ 'make [task] FFT=DUCC' for DUCC0 FFT (otherwise uses FFTW3)
+ You must at least 'make objclean' before changing such options!
+
+Also see docs/install.rst and docs/README
diff --git a/makefile b/makefile
index cacd633c6..6fc4ad965 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
-# Makefile for FINUFFT
+# Makefile for FINUFFT (CPU code only, and its various interfaces)
 
 # For simplicity, this is the only makefile; there are no makefiles in
-# subdirectories. This makefile is useful to show humans how to compile
+# subdirectories. This makefile is also useful to show humans how to compile
 # FINUFFT and its various language interfaces and examples.
 # Users should not need to edit this makefile (doing so would make it hard to
 # stay up to date with the repo version). Rather, in order to change
@@ -14,6 +14,7 @@
 # Garrett Wright, Joakim Anden, Barnett: dual-prec lib build, Jun-Jul'20.
 # Windows compatibility, jonas-kr, Sep '20.
 # XSIMD dependency, Marco Barbone, June 2024.
+# DUCC optional dependency to replace FFTW3. Barnett/Lu, 8/6/24.
 
 # Compiler (CXX), and linking from C, fortran. We use GCC by default...
 CXX = g++
@@ -21,13 +22,12 @@ CC = gcc
 FC = gfortran
 CLINK = -lstdc++
 FLINK = $(CLINK)
-# Python version: we use python3 by default, but you may need to change...
 PYTHON = python3
 # baseline compile flags for GCC (no multithreading):
 # Notes: 1) -Ofast breaks isfinite() & isnan(), so use -O3 which now is as fast
 #        2) -fcx-limited-range for fortran-speed complex arith in C++
 #        3) we use simply-expanded (:=) makefile variables, otherwise confusing
-# 		 4) the extra math flags are for speed, but they do not impact accuracy
+#        4) the extra math flags are for speed, but they do not impact accuracy;
 #           they allow gcc to vectorize the code more effectively
 CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\
 		  -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\
@@ -42,27 +42,40 @@ LIBS := -lm
 # multithreading for GCC: C++/C/Fortran, MATLAB, and octave (ICC differs)...
 OMPFLAGS = -fopenmp
 OMPLIBS = -lgomp
-MOMPFLAGS = -D_OPENMP
+# we bundle any libs mex needs here with flags...
+MOMPFLAGS = -D_OPENMP $(OMPLIBS)
 OOMPFLAGS =
 # MATLAB MEX compilation (also see below +=)...
-MFLAGS := -largeArrayDims
+MFLAGS := -DR2008OO -largeArrayDims
 # location of MATLAB's mex compiler (could add flags to switch GCC, etc)...
 MEX = mex
 # octave, and its mkoctfile and flags (also see below +=)...
 OCTAVE = octave
 MKOCTFILE = mkoctfile
-OFLAGS =
+OFLAGS = -DR2008OO
 # For experts only, location of MWrap executable (see docs/install.rst):
 MWRAP = mwrap
 
-# dependency root (relative to top directory)
+# root directory for dependencies to be downloaded:
 DEPS_ROOT := deps
 
-# xsimd dependency repo URL
+# xsimd header-only dependency repo
 XSIMD_URL := https://github.com/xtensor-stack/xsimd.git
 XSIMD_VERSION := 13.0.0
 XSIMD_DIR := $(DEPS_ROOT)/xsimd
 
+# DUCC sources optional dependency repo
+DUCC_URL := https://gitlab.mpcdf.mpg.de/mtr/ducc.git
+DUCC_VERSION := ducc0_0_34_0
+DUCC_DIR := $(DEPS_ROOT)/ducc
+# this dummy file used as empty target by make...
+DUCC_COOKIE := $(DUCC_DIR)/.finufft_has_ducc
+# for internal DUCC compile...
+DUCC_INCL := -I$(DUCC_DIR)/src
+DUCC_SRC := $(DUCC_DIR)/src/ducc0
+# for DUCC objects compile only (not our objects)...  *** check flags, pthreads?:
+DUCC_CXXFLAGS := -fPIC -std=c++17 -ffast-math
+
 # absolute path of this makefile, ie FINUFFT's top-level directory...
 FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST))))
 
@@ -74,28 +87,40 @@ FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST))))
 # -fPIC (position-indep code) needed to build dyn lib (.so)
 # Also, we force return (via :=) to the land of simply-expanded variables...
 INCL = -Iinclude -I$(XSIMD_DIR)/include
+# single-thread total list of math and FFT libs (now both precisions)...
+# (Note: finufft tests use LIBSFFT; spread & util tests only need LIBS)
+LIBSFFT := $(LIBS)
+ifeq ($(FFT),DUCC)
+  DUCC_SETUP := $(DUCC_COOKIE)
+# so FINUFFT build can see DUCC headers...
+  INCL += $(DUCC_INCL)
+  DUCC_OBJS := $(DUCC_SRC)/infra/string_utils.o $(DUCC_SRC)/infra/threading.o $(DUCC_SRC)/infra/mav.o $(DUCC_SRC)/math/gridding_kernel.o $(DUCC_SRC)/math/gl_integrator.o
+# FINUFFT's switchable FFT done via this compile directive...
+  CXXFLAGS += -DFINUFFT_USE_DUCC0
+else
+# link against FFTW3 single-threaded (leaves DUCC_OBJS and DUCC_SETUP undef)
+  LIBSFFT += -l$(FFTWNAME) -l$(FFTWNAME)f
+endif
 CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++17
 CFLAGS := $(CFLAGS) $(INCL) -fPIC
 # here /usr/include needed for fftw3.f "fortran header"... (JiriK: no longer)
 FFLAGS := $(FFLAGS) $(INCL) -I/usr/include -fPIC
 
-# single-thread total list of math and FFTW libs (now both precisions)...
-# (Note: finufft tests use LIBSFFT; spread & util tests only need LIBS)
-LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)f $(LIBS)
-
 # multi-threaded libs & flags, and req'd flags (OO for new interface)...
 ifneq ($(OMP),OFF)
   CXXFLAGS += $(OMPFLAGS)
   CFLAGS += $(OMPFLAGS)
   FFLAGS += $(OMPFLAGS)
-  MFLAGS += $(MOMPFLAGS) -DR2008OO
-  OFLAGS += $(OOMPFLAGS) -DR2008OO
+  MFLAGS += $(MOMPFLAGS)
+  OFLAGS += $(OOMPFLAGS)
   LIBS += $(OMPLIBS)
-# omp override for total list of math and FFTW libs (now both precisions)...
-  LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)_$(FFTWOMPSUFFIX) -l$(FFTWNAME)f -l$(FFTWNAME)f_$(FFTWOMPSUFFIX) $(LIBS)
+# fftw3 multithreaded libs...
+  ifneq ($(FFT),DUCC)
+    LIBSFFT += -l$(FFTWNAME)_$(FFTWOMPSUFFIX) -l$(FFTWNAME)f_$(FFTWOMPSUFFIX) $(OMPLIBS)
+  endif
 endif
 
-# name & location of library we're building...
+# name & location of shared library we're building...
 LIBNAME = libfinufft
 ifeq ($(MINGW),ON)
   DYNLIB = lib/$(LIBNAME).dll
@@ -123,8 +148,8 @@ OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/f
 OBJSF = $(OBJS:%.o=%_32.o)
 # precision-dependent library object files (compiled & linked only once)...
 OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o
-# all lib dual-precision objs
-OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI)
+# all lib dual-precision objs (note DUCC_OBJS empty if unused)
+OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS)
 
 .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean
 
@@ -143,22 +168,23 @@ usage:
 	@echo " make octave - compile and test octave interfaces"
 	@echo " make python - compile and test python interfaces"
 	@echo " make all - do all the above (around 1 minute; assumes you have MATLAB, etc)"
-	@echo " make spreadtest - compile & run spreader-only tests (no FFTW)"
+	@echo " make spreadtest - compile & run spreader-only tests (no FFT)"
 	@echo " make spreadtestall - small set spreader-only tests for CI use"
 	@echo " make objclean - remove all object files, preserving libs & MEX"
 	@echo " make clean - also remove all lib, MEX, py, and demo executables"
 	@echo " make setup - check (and possibly download) dependencies"
 	@echo " make setupclean - delete downloaded dependencies"
-	@echo "For faster (multicore) making, append, for example, -j8"
+	@echo "For faster (multicore) compilation, append, for example, -j8"
 	@echo ""
 	@echo "Make options:"
-	@echo " 'make [task] OMP=OFF' for single-threaded (otherwise OpenMP)"
-	@echo " You must 'make objclean' before changing such options!"
+	@echo " 'make [task] OMP=OFF' for single-threaded (no refs to OpenMP)"
+	@echo " 'make [task] FFT=DUCC' for DUCC0 FFT (otherwise uses FFTW3)"
+	@echo " You must at least 'make objclean' before changing such options!"
 	@echo ""
 	@echo "Also see docs/install.rst and docs/README"
 
 # collect headers for implicit depends (we don't separate public from private)
-HEADERS = $(wildcard include/*.h include/finufft/*.h)
+HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS)
 
 # implicit rules for objects (note -o ensures writes to correct dir)
 %.o: %.cpp $(HEADERS)
@@ -174,9 +200,15 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h)
 %_32.o: %.f
 	$(FC) -DSINGLE -c $(FFLAGS) $< -o $@
 
-# included auto-generated code and xsimd header-lib dependency...
-src/spreadinterp.o: src/ker_horner_allw_loop_constexpr.h $(XSIMD_DIR)/include/xsimd/xsimd.hpp
-src/spreadinterp_32.o: src/ker_horner_allw_loop_constexpr.h $(XSIMD_DIR)/include/xsimd/xsimd.hpp
+# spreadinterp includes auto-generated code and the xsimd header-only dependency;
+# if FFT=DUCC, also set up DUCC by making fft.h depend on $(DUCC_SETUP)...
+# Note src/spreadinterp.cpp includes finufft/defs.h which includes finufft/fft.h,
+# so the fftw/ducc header is needed for spreadinterp, even though spreadinterp
+# ideally should not depend on fftw/ducc directly (open question).
+include/finufft/fft.h: $(DUCC_SETUP)
+SHEAD = $(wildcard src/*.h) $(XSIMD_DIR)/include/xsimd/xsimd.hpp
+src/spreadinterp.o: $(SHEAD)
+src/spreadinterp_32.o: $(SHEAD)
 
 
 # lib -----------------------------------------------------------------------
@@ -206,7 +238,10 @@ endif
 
 # examples (C++/C) -----------------------------------------------------------
 # build all examples (single-prec codes separate, and not all have one)...
-EXAMPLES = $(basename $(wildcard examples/*.c examples/*.cpp))
+EXAMPLES := $(basename $(wildcard examples/*.c examples/*.cpp))
+ifeq ($(OMP),OFF)
+  EXAMPLES := $(filter-out $(basename $(wildcard examples/*thread*.cpp)),$(EXAMPLES))
+endif
 examples: $(EXAMPLES)
 ifneq ($(MINGW),ON)
   # Windows-MSYS does not find the dynamic libraries, so we make a temporary copy
@@ -400,7 +435,7 @@ endif
 
 # python ---------------------------------------------------------------------
 python: $(STATICLIB) $(DYNLIB)
-	FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install -e ./python/finufft
+	FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft
 # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps
 	$(PYTHON) python/finufft/test/run_accuracy_tests.py
 	$(PYTHON) python/finufft/examples/simple1d1.py
@@ -424,7 +459,7 @@ docker-wheel:
 	docker run --rm -e package_name=finufft -v `pwd`:/io libinlu/manylinux2010_x86_64_fftw /io/python/ci/build-wheels.sh
 
 
-# ================== SETUP OF EXTERNAL DEPENDENCIES ===============
+# ================== SETUP/COMPILE OF EXTERNAL DEPENDENCIES ===============
 
 define clone_repo
     @if [ ! -d "$(3)" ]; then \
@@ -443,12 +478,45 @@ define clone_repo
     fi
 endef
 
+# download: header-only, no compile needed...
 $(XSIMD_DIR)/include/xsimd/xsimd.hpp:
 	mkdir -p $(DEPS_ROOT)
-	@echo "Checking xsimd external dependency..."
+	@echo "Checking XSIMD external dependency..."
 	$(call clone_repo,$(XSIMD_URL),$(XSIMD_VERSION),$(XSIMD_DIR))
 	@echo "xsimd installed in deps/xsimd"
 
+# fetch the DUCC sources; the cookie is an empty marker file touched after the clone step
+$(DUCC_COOKIE):
+	mkdir -p $(DEPS_ROOT)
+	@echo "Checking DUCC external dependency..."
+	$(call clone_repo,$(DUCC_URL),$(DUCC_VERSION),$(DUCC_DIR))
+	touch $@
+	@echo "DUCC installed in deps/ducc"
+
+# explicit rules to compile just the needed DUCC objects; only used if FFT=DUCC.
+# Needed since DUCC has no makefile (yet).
+$(DUCC_SRC)/infra/string_utils.cc: $(DUCC_SETUP)
+$(DUCC_SRC)/infra/string_utils.o: $(DUCC_SRC)/infra/string_utils.cc
+	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+$(DUCC_SRC)/infra/threading.cc: $(DUCC_SETUP)
+$(DUCC_SRC)/infra/threading.o: $(DUCC_SRC)/infra/threading.cc
+	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+$(DUCC_SRC)/infra/mav.cc: $(DUCC_SETUP)
+$(DUCC_SRC)/infra/mav.o: $(DUCC_SRC)/infra/mav.cc
+	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+$(DUCC_SRC)/math/gridding_kernel.cc: $(DUCC_SETUP)
+$(DUCC_SRC)/math/gridding_kernel.o: $(DUCC_SRC)/math/gridding_kernel.cc
+	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+$(DUCC_SRC)/math/gl_integrator.cc: $(DUCC_SETUP)
+$(DUCC_SRC)/math/gl_integrator.o: $(DUCC_SRC)/math/gl_integrator.cc
+	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+# the following pattern rules do not yet work with -j, hence the explicit per-file rules above...
+#$(DUCC_SRC)/%.cc: $(DUCC_SETUP)
+#$(DUCC_SRC)/%.o: $(DUCC_SRC)/%.cc
+#	$(CXX) -c $(DUCC_CXXFLAGS) $(DUCC_INCL) $< -o $@
+
+setup: $(XSIMD_DIR)/include/xsimd/xsimd.hpp $(DUCC_SETUP)
+
 setupclean:
 	rm -rf $(DEPS_ROOT)
 
@@ -457,6 +525,8 @@ setupclean:
 
 docs: docs/*.docsrc docs/matlabhelp.doc docs/makecdocs.sh
 	(cd docs; ./makecdocs.sh)
+# capture the usage text (stdout of make w/o args); use $(MAKE) and suppress sub-make Entering/Leaving lines...
+	$(MAKE) --no-print-directory 1> docs/makefile.doc
 docs/matlabhelp.doc: docs/genmatlabhelp.sh matlab/*.sh matlab/*.docsrc matlab/*.docbit matlab/*.m
 	(cd matlab; ./addmhelp.sh)
 	(cd docs; ./genmatlabhelp.sh)
@@ -472,7 +542,7 @@ ifneq ($(MINGW),ON)
 	rm -f matlab/*.mex*
 	rm -f $(TESTS) test/results/*.out perftest/results/*.out
 	rm -f $(EXAMPLES) $(FE) $(ST) $(STF) $(STA) $(STAF) $(GTT) $(GTTF)
-	rm -f perftest/manysmallprobs
+	rm -f perftest/manysmallprobs perftest/big2d2f
 	rm -f examples/core test/core perftest/core $(FE_DIR)/core
 else
   # Windows-WSL clean up...
@@ -481,7 +551,7 @@ else
 	for %%f in ($(subst /,\, $(TESTS))) do ((if exist %%f del %%f) & (if exist %%f.exe del %%f.exe))
 	del test\results\*.out perftest\results\*.out
 	for %%f in ($(subst /,\, $(EXAMPLES)), $(subst /,\,$(FE)), $(subst /,\,$(ST)), $(subst /,\,$(STF)), $(subst /,\,$(STA)), $(subst /,\,$(STAF)), $(subst /,\,$(GTT)), $(subst /,\,$(GTTF))) do ((if exist %%f del %%f) & (if exist %%f.exe del %%f.exe))
-	del perftest\manysmallprobs
+	del perftest\manysmallprobs, perftest\big2d2f
 	del examples\core, test\core, perftest\core, $(subst /,\, $(FE_DIR))\core
 endif
 
@@ -489,13 +559,15 @@ endif
 # indiscriminate .o killer; needed before changing threading...
 objclean:
 ifneq ($(MINGW),ON)
-  # non-Windows-WSL...
+  # non-Windows-WSL... (note: cleans DUCC objects regardless of FFT choice)
 	rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o
 	rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o finufft_mod.mod
+	rm -f $(DUCC_SRC)/infra/*.o $(DUCC_SRC)/math/*.o
 else
   # Windows-WSL...
 	for /d %%d in (src,test\directfttest,examples,matlab,contrib) do (for %%f in (%%d\*.o) do (del %%f))
 	for /d %%d in (fortran,$(subst /,\, $(FE_DIR)),$(subst /,\, $(FD))) do (for %%f in (%%d\*.o) do (del %%f))
+  # *** to del DUCC *.o
 endif
 
 pyclean: