Skip to content

Commit

Permalink
Optimized 1D and 2D
Browse files Browse the repository at this point in the history
  • Loading branch information
DiamonDinoia committed Jul 15, 2024
1 parent 60f4780 commit 35dcc66
Show file tree
Hide file tree
Showing 9 changed files with 407 additions and 365 deletions.
6 changes: 3 additions & 3 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00};
constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00};
for (int i=0; i<3; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==4) {
constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02};
Expand All @@ -32,7 +32,7 @@ for (int i=0; i<3; i++) {
constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01};
constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00};
for (int i=0; i<4; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==5) {
constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02};
Expand All @@ -45,7 +45,7 @@ for (int i=0; i<3; i++) {
constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00};
constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00};
for (int i=0; i<5; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==6) {
constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03};
Expand Down
13 changes: 13 additions & 0 deletions include/cufinufft/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,19 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
}
return sqrt(nrm);
}

#ifdef __CUDA_ARCH__
__forceinline__ __device__ auto interval(const int ns, const float x) {
const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x));
const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x));
return int2{xstart, xend};
}
__forceinline__ __device__ auto interval(const int ns, const double x) {
const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
const auto xend = __double2int_rd(__fma_rd(ns, .5, x));
return int2{xstart, xend};
}
#endif
} // namespace utils
} // namespace cufinufft

Expand Down
23 changes: 13 additions & 10 deletions perftest/cuda/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,13 @@ def build_args(args):
# example command to run:
# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
# example arguments
args = {"--prec": "f",
"--n_runs": "10",
args = {"--prec": "d",
"--n_runs": "5",
"--method": "0",
"--sort": "1",
"--N1": "16777216",
# "--N2": "256",
# "--N1": "256",
# "--N2": "256",
# "--N1": "16777216",
"--N1": "256",
"--N2": "256",
# "--N3": "256",
"--kerevalmethod": "1",
"--M": "1E8",
Expand Down Expand Up @@ -93,6 +92,10 @@ def build_args(args):
conf = [x for x in stdout.splitlines() if x.startswith("#")]
print('\n'.join(conf))
stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7]
if stdout[0].startswith("bin"):
print(stdout[0])
stdout = stdout[1:]

stdout = '\n'.join(stdout)
# convert stdout to a dataframe from csv string
dt = pd.read_csv(io.StringIO(stdout), sep=',')
Expand Down Expand Up @@ -153,7 +156,7 @@ def build_args(args):
min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
print(min_val, max_val)
plt.ylim(min_val * .99, max_val * 1.01)
plt.ylim(min_val * .90, max_val * 1.1)
# plt.ylim(.8, 1.2)

# Calculate the smallest power of 10
Expand All @@ -163,15 +166,15 @@ def build_args(args):
# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin

# plot an horizontal line at 1 with label "GM"
plt.axhline(y=1, color='k', linestyle='--', label='GM')
# plt.axhline(y=1, color='k', linestyle='--', label='GM')
plt.xlabel('Tolerance')
plt.ylabel('Throughput (% of GM)')
plt.ylabel('Throughput')
plt.title('Throughput by Tolerance and Method')
plt.legend(title='Method')
plt.tight_layout()
plt.show()
plt.xlabel("Tolerance")
plt.ylabel("Points/s (% of GM)")
plt.ylabel("Points/s")
plt.savefig("bench.png")
plt.savefig("bench.svg")
plt.savefig("bench.pdf")
Expand Down
1 change: 1 addition & 0 deletions src/cuda/1d/spread1d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize)

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {

cufinufft_set_shared_memory(spread_1d_subprob<T, 1>, 1, *d_plan);
RETURN_IF_CUDA_ERROR
spread_1d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
Expand Down
43 changes: 11 additions & 32 deletions src/cuda/1d/spreadinterp1d.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,15 @@ template<typename T, int KEREVALMETH>
__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c,
cuda_complex<T> *fw, int M, int ns, int nf1, T es_c,
T es_beta, T sigma, const int *idxnupts) {

// dynamic stack allocation to reduce stack usage
auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns);

for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
i += blockDim.x * gridDim.x) {
const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
const auto cnow = c[idxnupts[i]];
const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept {
if constexpr (std::is_same_v<T, float>) {
const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled));
const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled));
return int2{xstart, xend};
}
if constexpr (std::is_same_v<T, double>) {
const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled));
const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled));
return int2{xstart, xend};
}
}();
const T x1 = (T)xstart - x_rescaled;
const auto [xstart, xend] = interval(ns, x_rescaled);
const T x1 = (T)xstart - x_rescaled;
if constexpr (KEREVALMETH == 1)
eval_kernel_vec_horner(ker1, x1, ns, sigma);
else
Expand Down Expand Up @@ -126,27 +115,17 @@ __global__ void spread_1d_subprob(
for (int i = threadIdx.x; i < N; i += blockDim.x) {
fwshared[i] = {0, 0};
}

const T ns_2f = ns * T(.5);

__syncthreads();

for (auto i = threadIdx.x; i < nupts; i += blockDim.x) {
const auto idx = ptstart + i;
const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
const auto cnow = c[idxnupts[idx]];

const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept {
if constexpr (std::is_same_v<T, float>) {
const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled));
const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled));
return int2{xstart, xend};
}
if constexpr (std::is_same_v<T, double>) {
const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled));
const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled));
return int2{xstart, xend};
}
}();

const T x1 = T(xstart + xoffset) - x_rescaled;
const auto idx = ptstart + i;
const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
const auto cnow = c[idxnupts[idx]];
const auto [xstart, xend] = interval(ns, x_rescaled);
const T x1 = T(xstart + xoffset) - x_rescaled;
if constexpr (KEREVALMETH == 1)
eval_kernel_vec_horner(ker1, x1, ns, sigma);
else
Expand Down
16 changes: 8 additions & 8 deletions src/cuda/2d/interp2d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
#include <cuComplex.h>
#include <cufinufft/contrib/helper_cuda.h>

#include <cufinufft/common.h>
#include <cufinufft/memtransfer.h>
#include <cufinufft/spreadinterp.h>

using namespace cufinufft::memtransfer;
using namespace cufinufft::common;

#include "spreadinterp2d.cuh"

Expand Down Expand Up @@ -120,17 +122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int *d_subprob_to_bin = d_plan->subprob_to_bin;
int totalnumsubprob = d_plan->totalnumsubprob;

T sigma = d_plan->opts.upsampfac;
size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
(bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);

if (sharedplanorysize > 49152) {
std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n";
return FINUFFT_ERR_INSUFFICIENT_SHMEM;
}
T sigma = d_plan->opts.upsampfac;
const auto sharedplanorysize =
shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
cufinufft_set_shared_memory(interp_2d_subprob<T, 1>, 2, *d_plan);
interp_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
Expand All @@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
}
} else {
for (int t = 0; t < blksize; t++) {
cufinufft_set_shared_memory(interp_2d_subprob<T, 0>, 2, *d_plan);
interp_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
Expand Down
15 changes: 8 additions & 7 deletions src/cuda/2d/spread2d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#include <cufinufft/common.h>
#include <cufinufft/memtransfer.h>
#include <cufinufft/precision_independent.h>
#include <cufinufft/spreadinterp.h>
Expand Down Expand Up @@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,

T sigma = d_plan->opts.upsampfac;

size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) *
(bin_size_y + 2 * (int)ceil(ns / 2.0)) *
sizeof(cuda_complex<T>);
if (sharedplanorysize > 49152) {
std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n";
return FINUFFT_ERR_INSUFFICIENT_SHMEM;
}
const auto sharedplanorysize =
shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
cufinufft_set_shared_memory(spread_2d_subprob<T, 1>, 2, *d_plan);
RETURN_IF_CUDA_ERROR
spread_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
Expand All @@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
}
} else {
for (int t = 0; t < blksize; t++) {
cufinufft_set_shared_memory(spread_2d_subprob<T, 0>, 2, *d_plan);
RETURN_IF_CUDA_ERROR
spread_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
Expand Down
Loading

0 comments on commit 35dcc66

Please sign in to comment.