diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index c9c5e2ca2..1178a8544 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -20,7 +20,7 @@ constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; @@ -32,7 +32,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; @@ -45,7 +45,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3455b99c0..b0a77aec7 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -68,6 +68,19 @@ template T infnorm(int n, std::complex *a) { } return sqrt(nrm); } + +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index dbcaed87f..db7e73873 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile 
./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", - "--n_runs": "10", +args = {"--prec": "d", + "--n_runs": "5", "--method": "0", "--sort": "1", - "--N1": "16777216", - # "--N2": "256", - # "--N1": "256", - # "--N2": "256", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -93,6 +92,10 @@ def build_args(args): conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') @@ -153,7 +156,7 @@ def build_args(args): min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) -plt.ylim(min_val * .99, max_val * 1.01) +plt.ylim(min_val * .90, max_val * 1.1) # plt.ylim(.8, 1.2) # Calculate the smallest power of 10 @@ -163,15 +166,15 @@ def build_args(args): # plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin # plot an horizontal line at 1 with label "GM" -plt.axhline(y=1, color='k', linestyle='--', label='GM') +# plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput (% of GM)') +plt.ylabel('Throughput') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s (% of GM)") +plt.ylabel("Points/s") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index e958bfea3..4e7f4ea0b 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 68656c124..f94ffd7eb 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,26 +23,15 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - + // dynamic stack allocation to reduce stack usage auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); const auto cnow = c[idxnupts[i]]; - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - const T x1 = (T)xstart - x_rescaled; + const auto [xstart, xend] = 
interval(ns, x_rescaled); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else @@ -126,27 +115,17 @@ __global__ void spread_1d_subprob( for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; } + + const T ns_2f = ns * T(.5); + __syncthreads(); for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { - const auto idx = ptstart + i; - const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - const auto cnow = c[idxnupts[idx]]; - - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - - const T x1 = T(xstart + xoffset) - x_rescaled; + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 533788482..eda0d579b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -4,10 +4,12 @@ #include #include +#include #include #include using namespace cufinufft::memtransfer; +using namespace cufinufft::common; #include "spreadinterp2d.cuh" @@ -120,17 +122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->opts.upsampfac; + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 69b2ba956..d361791b0 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, T sigma = d_plan->opts.upsampfac; - size_t 
sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * - (bin_size_y + 2 * (int)ceil(ns / 2.0)) * - sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 558984ea1..62a430ca5 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,314 +15,314 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xstart, ystart, xend, yend; - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - T kervalue1, kervalue2; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - ystart = ceil(y_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - outidx = ix + iy * nf1; - kervalue1 = ker1[xx - xstart]; - kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_nupts_driven( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + const auto x1 = (T)xstart - x_rescaled; + const auto y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + for (auto yy = ystart; yy <= yend; yy++) { + for (auto xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto outidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + const auto kervalue2 = ker2[yy - ystart]; + atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); + atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + } + } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +template +__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, + int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx) { - int binidx, binx, biny; - int oldidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx || biny >= nbiny) { - sortidx[i] = -biny; - } + int binidx, binx, biny; + int oldidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binidx = binx + biny * nbinx; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx || biny >= nbiny) { + sortidx[i] = -biny; } + } } -template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, - const int *bin_startpts, const int *sortidx, const T *x, - const T *y, int *index, int nf1, int nf2) { - int binx, biny; - int binidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_2d( + int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, + const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) { + int binx, biny; + int binidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binidx = binx + biny * nbinx; + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, 
y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - T kervalue2 = ker2[yy - ystart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_subprob( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto cnow = c[idxnupts[idx]]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + for (int yy = ystart; yy <= yend; yy++) { + const auto iy = yy + ns_2; + if (iy >= (bin_size_y + rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); + } + } + } + + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + const auto i = k % (bin_size_x + rounded_ns); + const auto j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; +template +__global__ void interp_2d_nupts_driven( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + T x1 = (T)xstart - x_rescaled; + T y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + cuda_complex cnow{0, 0}; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto inidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2; + cnow.y += fw[inidx].y * kervalue1 * kervalue2; + } + } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for Subprob Method */ -template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_2d_subprob( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + for (int k = threadIdx.x; k < N; k += blockDim.x) { + int i = k % (bin_size_x + rounded_ns); + int j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + int(iy * nf1); + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[idx]] = cnow; + } + __syncthreads(); + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = 
interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto iy = yy + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index f5661b1dd..e7ce65b52 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -221,18 +221,68 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize +template int find_bin_size(std::size_t MemSize, int dim, int ns) { + int binsize = 1; // Start with the smallest possible bin size + + while (true) { + // Calculate the shared memory required for the current bin_size_x and bin_size_y + std::size_t required_memory = + shared_memory_required(dim, ns, binsize, binsize, binsize); + + // Check if the required memory is less than the available memory + if (required_memory > MemSize) { + // If the condition is met, return the current bin_size_x + return binsize - 1; + } + + // Increment bin_size_x for the next iteration + binsize++; + } +} + template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { - switch (opts->gpu_method) { - case 1: - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 
1024 : opts->gpu_binsizex; - break; - case 0: - case 2: - if (opts->gpu_binsizex < 0) { + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + const auto shared_mem_required = shared_memory_required( + dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, + // shared_mem_required); + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0) { + switch (opts->gpu_method) { + case 0: + case 2: { + opts->gpu_binsizex = 32; + opts->gpu_binsizey = 32; + // fall through otherwise + if (opts->gpu_method && ns > 2) { + break; + } + } + case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); @@ -242,22 +292,17 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - const int bin_size = - shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - const int exponent = std::log2(bin_size); - opts->gpu_binsizex = 1 << (exponent - 1); - // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, - // opts->gpu_binsizex); + + const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + opts->gpu_binsizex = binsize; + opts->gpu_binsizey = binsize; + } break; } - break; } - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: {
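Note on the new interval() helper (a minimal host-side sketch, not part of the patch): the device overloads added in cufinufft/utils.h are intended to reproduce the spreading window that the kernels previously computed inline as ceil(x - ns/2.0) and floor(x + ns/2.0), using directed-rounding FMA and float/double-to-int conversion intrinsics. The reference function below (reference_interval is a hypothetical name introduced here for illustration) spells out that expected semantics and can be compiled on the host to sanity-check a few values.

#include <cmath>
#include <cstdio>

// Host-side reference for the window the device interval() helper is expected
// to return: [ceil(x - ns/2), floor(x + ns/2)], matching the old kernel code.
static void reference_interval(int ns, double x, int &xstart, int &xend) {
  xstart = static_cast<int>(std::ceil(x - ns * 0.5));
  xend   = static_cast<int>(std::floor(x + ns * 0.5));
}

int main() {
  int xstart = 0, xend = 0;
  reference_interval(7, 123.4, xstart, xend); // ns = 7, rescaled coordinate 123.4
  // Expected: window [120, 126], i.e. ns = 7 grid points.
  std::printf("window = [%d, %d], npts = %d\n", xstart, xend, xend - xstart + 1);
  return 0;
}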
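Note on the shared-memory-driven bin sizing (a sketch under stated assumptions, not part of the patch): find_bin_size() in common.cu increases the candidate bin size until shared_memory_required() exceeds the device's opt-in shared-memory-per-block limit, then returns the last size that fit. The standalone sketch below mimics that linear search on the host; shared_mem_2d is a local stand-in that uses the same (bin + 2*ceil(ns/2))^2 * sizeof(complex) footprint as the old inline check in cuspread2d_subprob, and the memory limit is an arbitrary example value, so the numbers are illustrative only.

#include <complex>
#include <cstddef>
#include <cstdio>

// Stand-in for the 2D shared-memory footprint in bytes: padded square bin,
// mirroring the old inline formula (bin + 2*ceil(ns/2))^2 * sizeof(complex<float>).
static std::size_t shared_mem_2d(int ns, int bin) {
  const int pad = ((ns + 1) / 2) * 2;
  return static_cast<std::size_t>(bin + pad) * (bin + pad) * sizeof(std::complex<float>);
}

// Largest square bin whose padded footprint still fits in mem_bytes,
// following the same search strategy as find_bin_size() in the patch.
static int largest_square_bin(std::size_t mem_bytes, int ns) {
  int bin = 1;
  while (shared_mem_2d(ns, bin) <= mem_bytes) bin++;
  return bin - 1; // last size that fit
}

int main() {
  const std::size_t opt_in_smem = 99 * 1024; // example opt-in limit; device dependent
  const int ns = 8;                          // kernel width
  std::printf("square bin size = %d\n", largest_square_bin(opt_in_smem, ns));
  return 0;
}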