diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index c9c5e2ca2..1178a8544 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -20,7 +20,7 @@ constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; @@ -32,7 +32,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; @@ -45,7 +45,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3455b99c0..b0a77aec7 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -68,6 +68,19 @@ template T infnorm(int n, std::complex *a) { } return sqrt(nrm); } + +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index dbcaed87f..db7e73873 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile 
./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", - "--n_runs": "10", +args = {"--prec": "d", + "--n_runs": "5", "--method": "0", "--sort": "1", - "--N1": "16777216", - # "--N2": "256", - # "--N1": "256", - # "--N2": "256", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -93,6 +92,10 @@ def build_args(args): conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') @@ -153,7 +156,7 @@ def build_args(args): min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) -plt.ylim(min_val * .99, max_val * 1.01) +plt.ylim(min_val * .90, max_val * 1.1) # plt.ylim(.8, 1.2) # Calculate the smallest power of 10 @@ -163,15 +166,15 @@ def build_args(args): # plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin # plot an horizontal line at 1 with label "GM" -plt.axhline(y=1, color='k', linestyle='--', label='GM') +# plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput (% of GM)') +plt.ylabel('Throughput') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s (% of GM)") +plt.ylabel("Points/s") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index e958bfea3..4e7f4ea0b 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 68656c124..f94ffd7eb 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,26 +23,15 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - + // dynamic stack allocation to reduce stack usage auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); const auto cnow = c[idxnupts[i]]; - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - const T x1 = (T)xstart - x_rescaled; + const auto [xstart, xend] = 
interval(ns, x_rescaled); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else @@ -126,27 +115,17 @@ __global__ void spread_1d_subprob( for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; } + + const T ns_2f = ns * T(.5); + __syncthreads(); for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { - const auto idx = ptstart + i; - const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - const auto cnow = c[idxnupts[idx]]; - - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - - const T x1 = T(xstart + xoffset) - x_rescaled; + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 533788482..eda0d579b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -4,10 +4,12 @@ #include #include +#include #include #include using namespace cufinufft::memtransfer; +using namespace cufinufft::common; #include "spreadinterp2d.cuh" @@ -120,17 +122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->opts.upsampfac; + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 69b2ba956..d361791b0 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, T sigma = d_plan->opts.upsampfac; - size_t 
sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * - (bin_size_y + 2 * (int)ceil(ns / 2.0)) * - sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 558984ea1..62a430ca5 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,314 +15,314 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xstart, ystart, xend, yend; - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - T kervalue1, kervalue2; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - ystart = ceil(y_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - outidx = ix + iy * nf1; - kervalue1 = ker1[xx - xstart]; - kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_nupts_driven( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + const auto x1 = (T)xstart - x_rescaled; + const auto y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + for (auto yy = ystart; yy <= yend; yy++) { + for (auto xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto outidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + const auto kervalue2 = ker2[yy - ystart]; + atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); + atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + } + } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +template +__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, + int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx) { - int binidx, binx, biny; - int oldidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx || biny >= nbiny) { - sortidx[i] = -biny; - } + int binidx, binx, biny; + int oldidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binidx = binx + biny * nbinx; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx || biny >= nbiny) { + sortidx[i] = -biny; } + } } -template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, - const int *bin_startpts, const int *sortidx, const T *x, - const T *y, int *index, int nf1, int nf2) { - int binx, biny; - int binidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_2d( + int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, + const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) { + int binx, biny; + int binidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binidx = binx + biny * nbinx; + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, 
y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - T kervalue2 = ker2[yy - ystart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_subprob( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto cnow = c[idxnupts[idx]]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + for (int yy = ystart; yy <= yend; yy++) { + const auto iy = yy + ns_2; + if (iy >= (bin_size_y + rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); + } + } + } + + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + const auto i = k % (bin_size_x + rounded_ns); + const auto j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; +template +__global__ void interp_2d_nupts_driven( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + T x1 = (T)xstart - x_rescaled; + T y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + cuda_complex cnow{0, 0}; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto inidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2; + cnow.y += fw[inidx].y * kervalue1 * kervalue2; + } + } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for Subprob Method */ -template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_2d_subprob( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + for (int k = threadIdx.x; k < N; k += blockDim.x) { + int i = k % (bin_size_x + rounded_ns); + int j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + int(iy * nf1); + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[idx]] = cnow; + } + __syncthreads(); + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = 
interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto iy = yy + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index f5661b1dd..e7ce65b52 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -221,18 +221,68 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize +template int find_bin_size(std::size_t MemSize, int dim, int ns) { + int binsize = 1; // Start with the smallest possible bin size + + while (true) { + // Calculate the shared memory required for the current bin_size_x and bin_size_y + std::size_t required_memory = + shared_memory_required(dim, ns, binsize, binsize, binsize); + + // Check if the required memory is less than the available memory + if (required_memory > MemSize) { + // If the condition is met, return the current bin_size_x + return binsize - 1; + } + + // Increment bin_size_x for the next iteration + binsize++; + } +} + template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { - switch (opts->gpu_method) { - case 1: - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 
1024 : opts->gpu_binsizex; - break; - case 0: - case 2: - if (opts->gpu_binsizex < 0) { + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + const auto shared_mem_required = shared_memory_required( + dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, + // shared_mem_required); + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0) { + switch (opts->gpu_method) { + case 0: + case 2: { + opts->gpu_binsizex = 32; + opts->gpu_binsizey = 32; + // fall through otherwise + if (opts->gpu_method && ns > 2) { + break; + } + } + case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); @@ -242,22 +292,17 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - const int bin_size = - shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - const int exponent = std::log2(bin_size); - opts->gpu_binsizex = 1 << (exponent - 1); - // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, - // opts->gpu_binsizex); + + const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + opts->gpu_binsizex = binsize; + opts->gpu_binsizey = binsize; + } break; } - break; } - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: {
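Note on the new interval() helper (a minimal host-side sketch, not part of the patch): the device overloads added in cufinufft/utils.h are intended to reproduce the spreading window that the kernels previously computed inline as ceil(x - ns/2.0) and floor(x + ns/2.0), using directed-rounding FMA and float/double-to-int conversion intrinsics. The reference function below (reference_interval is a hypothetical name introduced here for illustration) spells out that expected semantics and can be compiled on the host to sanity-check a few values.

#include <cmath>
#include <cstdio>

// Host-side reference for the window the device interval() helper is expected
// to return: [ceil(x - ns/2), floor(x + ns/2)], matching the old kernel code.
static void reference_interval(int ns, double x, int &xstart, int &xend) {
  xstart = static_cast<int>(std::ceil(x - ns * 0.5));
  xend   = static_cast<int>(std::floor(x + ns * 0.5));
}

int main() {
  int xstart = 0, xend = 0;
  reference_interval(7, 123.4, xstart, xend); // ns = 7, rescaled coordinate 123.4
  // Expected: window [120, 126], i.e. ns = 7 grid points.
  std::printf("window = [%d, %d], npts = %d\n", xstart, xend, xend - xstart + 1);
  return 0;
}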
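Note on the shared-memory-driven bin sizing (a sketch under stated assumptions, not part of the patch): find_bin_size() in common.cu increases the candidate bin size until shared_memory_required() exceeds the device's opt-in shared-memory-per-block limit, then returns the last size that fit. The standalone sketch below mimics that linear search on the host; shared_mem_2d is a local stand-in that uses the same (bin + 2*ceil(ns/2))^2 * sizeof(complex) footprint as the old inline check in cuspread2d_subprob, and the memory limit is an arbitrary example value, so the numbers are illustrative only.

#include <complex>
#include <cstddef>
#include <cstdio>

// Stand-in for the 2D shared-memory footprint in bytes: padded square bin,
// mirroring the old inline formula (bin + 2*ceil(ns/2))^2 * sizeof(complex<float>).
static std::size_t shared_mem_2d(int ns, int bin) {
  const int pad = ((ns + 1) / 2) * 2;
  return static_cast<std::size_t>(bin + pad) * (bin + pad) * sizeof(std::complex<float>);
}

// Largest square bin whose padded footprint still fits in mem_bytes,
// following the same search strategy as find_bin_size() in the patch.
static int largest_square_bin(std::size_t mem_bytes, int ns) {
  int bin = 1;
  while (shared_mem_2d(ns, bin) <= mem_bytes) bin++;
  return bin - 1; // last size that fit
}

int main() {
  const std::size_t opt_in_smem = 99 * 1024; // example opt-in limit; device dependent
  const int ns = 8;                          // kernel width
  std::printf("square bin size = %d\n", largest_square_bin(opt_in_smem, ns));
  return 0;
}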