Optimized 1D and 2D

flatironinstitute · Jul 15, 2024 · 35dcc66 · 35dcc66
1 parent 60f4780
commit 35dcc66
Show file tree

Hide file tree

Showing 9 changed files with 407 additions and 365 deletions.
diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc
@@ -20,7 +20,7 @@
     constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00};
     constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00};
 for (int i=0; i<3; i++) {
-    ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
 }
   } else if (w==4) {
     constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02};
@@ -32,7 +32,7 @@ for (int i=0; i<3; i++) {
     constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01};
     constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00};
     for (int i=0; i<4; i++) {
-        ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
     }
   } else if (w==5) {
     constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02};
@@ -45,7 +45,7 @@ for (int i=0; i<3; i++) {
     constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00};
     constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00};
     for (int i=0; i<5; i++) {
-        ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
     }
   } else if (w==6) {
     constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03};

diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
@@ -68,6 +68,19 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
   }
   return sqrt(nrm);
 }
+
+#ifdef __CUDA_ARCH__
+__forceinline__ __device__ auto interval(const int ns, const float x) {
+  const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x));
+  const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x));
+  return int2{xstart, xend};
+}
+__forceinline__ __device__ auto interval(const int ns, const double x) {
+  const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
+  const auto xend   = __double2int_rd(__fma_rd(ns, .5, x));
+  return int2{xstart, xend};
+}
+#endif
 } // namespace utils
 } // namespace cufinufft
 

diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py
@@ -37,14 +37,13 @@ def build_args(args):
 # example command to run:
 # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
 # example arguments
-args = {"--prec": "f",
-        "--n_runs": "10",
+args = {"--prec": "d",
+        "--n_runs": "5",
         "--method": "0",
         "--sort": "1",
-        "--N1": "16777216",
-        # "--N2": "256",
-        # "--N1": "256",
-        # "--N2": "256",
+        # "--N1": "16777216",
+        "--N1": "256",
+        "--N2": "256",
         # "--N3": "256",
         "--kerevalmethod": "1",
         "--M": "1E8",
@@ -93,6 +92,10 @@ def build_args(args):
         conf = [x for x in stdout.splitlines() if x.startswith("#")]
         print('\n'.join(conf))
         stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7]
+        if stdout[0].startswith("bin"):
+            print(stdout[0])
+            stdout = stdout[1:]
+
         stdout = '\n'.join(stdout)
         # convert stdout to a dataframe from csv string
         dt = pd.read_csv(io.StringIO(stdout), sep=',')
@@ -153,7 +156,7 @@ def build_args(args):
 min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
 max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
 print(min_val, max_val)
-plt.ylim(min_val * .99, max_val * 1.01)
+plt.ylim(min_val * .90, max_val * 1.1)
 # plt.ylim(.8, 1.2)
 
 # Calculate the smallest power of 10
@@ -163,15 +166,15 @@ def build_args(args):
 # plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009)  # Adding 10% for upper margin
 
 # plot an horizontal line at 1 with label "GM"
-plt.axhline(y=1, color='k', linestyle='--', label='GM')
+# plt.axhline(y=1, color='k', linestyle='--', label='GM')
 plt.xlabel('Tolerance')
-plt.ylabel('Throughput (% of GM)')
+plt.ylabel('Throughput')
 plt.title('Throughput by Tolerance and Method')
 plt.legend(title='Method')
 plt.tight_layout()
 plt.show()
 plt.xlabel("Tolerance")
-plt.ylabel("Points/s (% of GM)")
+plt.ylabel("Points/s")
 plt.savefig("bench.png")
 plt.savefig("bench.svg")
 plt.savefig("bench.pdf")

diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu
@@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize)
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+
       cufinufft_set_shared_memory(spread_1d_subprob<T, 1>, 1, *d_plan);
       RETURN_IF_CUDA_ERROR
       spread_1d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(

diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh
@@ -23,26 +23,15 @@ template<typename T, int KEREVALMETH>
 __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c,
                                       cuda_complex<T> *fw, int M, int ns, int nf1, T es_c,
                                       T es_beta, T sigma, const int *idxnupts) {
-
+  // dynamic stack allocation to reduce stack usage
   auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns);
 
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
        i += blockDim.x * gridDim.x) {
     const auto x_rescaled     = fold_rescale(x[idxnupts[i]], nf1);
     const auto cnow           = c[idxnupts[i]];
-    const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept {
-      if constexpr (std::is_same_v<T, float>) {
-        const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled));
-        const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled));
-        return int2{xstart, xend};
-      }
-      if constexpr (std::is_same_v<T, double>) {
-        const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled));
-        const auto xend   = __double2int_rd(__fma_rd(ns, .5, x_rescaled));
-        return int2{xstart, xend};
-      }
-    }();
-    const T x1 = (T)xstart - x_rescaled;
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    const T x1                = (T)xstart - x_rescaled;
     if constexpr (KEREVALMETH == 1)
       eval_kernel_vec_horner(ker1, x1, ns, sigma);
     else
@@ -126,27 +115,17 @@ __global__ void spread_1d_subprob(
   for (int i = threadIdx.x; i < N; i += blockDim.x) {
     fwshared[i] = {0, 0};
   }
+
+  const T ns_2f = ns * T(.5);
+
   __syncthreads();
 
   for (auto i = threadIdx.x; i < nupts; i += blockDim.x) {
-    const auto idx        = ptstart + i;
-    const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
-    const auto cnow       = c[idxnupts[idx]];
-
-    const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept {
-      if constexpr (std::is_same_v<T, float>) {
-        const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled));
-        const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled));
-        return int2{xstart, xend};
-      }
-      if constexpr (std::is_same_v<T, double>) {
-        const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled));
-        const auto xend   = __double2int_rd(__fma_rd(ns, .5, x_rescaled));
-        return int2{xstart, xend};
-      }
-    }();
-
-    const T x1 = T(xstart + xoffset) - x_rescaled;
+    const auto idx            = ptstart + i;
+    const auto x_rescaled     = fold_rescale(x[idxnupts[idx]], nf1);
+    const auto cnow           = c[idxnupts[idx]];
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    const T x1                = T(xstart + xoffset) - x_rescaled;
     if constexpr (KEREVALMETH == 1)
       eval_kernel_vec_horner(ker1, x1, ns, sigma);
     else

diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu
@@ -4,10 +4,12 @@
 #include <cuComplex.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
+#include <cufinufft/common.h>
 #include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 
 using namespace cufinufft::memtransfer;
+using namespace cufinufft::common;
 
 #include "spreadinterp2d.cuh"
 
@@ -120,17 +122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
   int *d_subprob_to_bin  = d_plan->subprob_to_bin;
   int totalnumsubprob    = d_plan->totalnumsubprob;
 
-  T sigma                  = d_plan->opts.upsampfac;
-  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  T sigma = d_plan->opts.upsampfac;
+  const auto sharedplanorysize =
+      shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(interp_2d_subprob<T, 1>, 2, *d_plan);
       interp_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
@@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
     }
   } else {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(interp_2d_subprob<T, 0>, 2, *d_plan);
       interp_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,

diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu
@@ -7,6 +7,7 @@
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
+#include <cufinufft/common.h>
 #include <cufinufft/memtransfer.h>
 #include <cufinufft/precision_independent.h>
 #include <cufinufft/spreadinterp.h>
@@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
 
   T sigma = d_plan->opts.upsampfac;
 
-  size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * (int)ceil(ns / 2.0)) *
-                             sizeof(cuda_complex<T>);
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  const auto sharedplanorysize =
+      shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(spread_2d_subprob<T, 1>, 2, *d_plan);
+      RETURN_IF_CUDA_ERROR
       spread_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
@@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
     }
   } else {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(spread_2d_subprob<T, 0>, 2, *d_plan);
+      RETURN_IF_CUDA_ERROR
       spread_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,