Skip to content

Commit

Permalink
otpimized nupts driven
Browse files Browse the repository at this point in the history
  • Loading branch information
DiamonDinoia committed Jul 12, 2024
1 parent 907797c commit 60f4780
Show file tree
Hide file tree
Showing 7 changed files with 199 additions and 92 deletions.
15 changes: 8 additions & 7 deletions include/cufinufft/contrib/helper_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
}

#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
__FILE__, __LINE__); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
}

#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \
Expand Down
28 changes: 21 additions & 7 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01};
constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00};
constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00};
for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
for (int i = 0; i < 2; i++) {
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==3) {
constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02};
constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02};
Expand All @@ -17,7 +19,9 @@
constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01};
constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00};
constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00};
for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i]))))));
for (int i=0; i<3; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==4) {
constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02};
constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03};
Expand All @@ -27,7 +31,9 @@
constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01};
constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01};
constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00};
for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
for (int i=0; i<4; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==5) {
constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02};
constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03};
Expand All @@ -38,7 +44,9 @@
constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01};
constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00};
constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00};
for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i]))))))));
for (int i=0; i<5; i++) {
ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==6) {
constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03};
constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03};
Expand All @@ -50,7 +58,9 @@
constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01};
constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00};
constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00};
for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
for (int i=0; i<6; i++) {
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==7) {
constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03};
constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04};
Expand All @@ -63,7 +73,9 @@
constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01};
constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01};
constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00};
for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
for (int i=0; i<7; i++) {
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==8) {
constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03};
constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04};
Expand All @@ -76,7 +88,9 @@
constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02};
constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01};
constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01};
for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
for (int i = 0; i < 8; i++) {
ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
}
} else if (w==9) {
constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04};
constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04};
Expand Down
53 changes: 46 additions & 7 deletions include/cufinufft/spreadinterp.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,39 @@
namespace cufinufft {
namespace spreadinterp {

template<typename T> static __forceinline__ __device__ T fold_rescale(T x, int N) {
static constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
const T result = x * x2pi + T(0.5);
return (result - floor(result)) * T(N);
template<typename T>
constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
constexpr const auto half = T(0.5);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<T, float>) {
auto result = __fmaf_rn(x, x2pi, half);
result = __fsub_rd(result, truncf(result));
return __fmul_rd(result, static_cast<T>(N));
} else if constexpr (std::is_same_v<T, double>) {
auto result = __fma_rn(x, x2pi, half);
result = __dsub_rd(result, trunc(result));
return __dmul_rd(result, static_cast<T>(N));
} else {
static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
"Only float and double are supported.");
}
#else
const auto result = std::fma(x, x2pi, half);
return (result - std::trunc(result)) * static_cast<T>(N);
#endif
}

template<typename T>
static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
if constexpr (std::is_same_v<T, float>) {
return __fmaf_rn(a, b, c);
} else if constexpr (std::is_same_v<T, double>) {
return __fma_rn(a, b, c);
} else {
static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
"Only float and double are supported.");
}
}

template<typename T>
Expand All @@ -23,11 +52,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
approximation to prolate spheroidal wavefunction (PSWF) of order 0.
This is the "reference implementation", used by eg common/onedim_* 2/17/17 */
{
if (abs(x) >= opts.ES_halfwidth)
if (abs(x) >= T(opts.ES_halfwidth))
// if spreading/FT careful, shouldn't need this if, but causes no speed hit
return 0.0;
else
return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x));
return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x));
}

template<typename T>
Expand All @@ -53,7 +82,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons
This is the current evaluation method, since it's faster (except i7 w=16).
Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
{
T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1]
#ifdef __CUDA_ARCH__
__builtin_assume(w >= 2);
if constexpr (std::is_same_v<T, float>) {
__builtin_assume(w <= 7);
}
if constexpr (std::is_same_v<T, double>) {
__builtin_assume(w <= 16);
}
#endif
const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
// T z = 2 * x + w - 1.0;
// insert the auto-generated code which expects z, w args, writes to ker...
if (upsampfac == 2.0) { // floating point equality is fine here
using FLT = T;
Expand Down
18 changes: 10 additions & 8 deletions perftest/cuda/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,12 @@ def build_args(args):
"--n_runs": "10",
"--method": "0",
"--sort": "1",
# "--N1": "16777216",
"--N1": "256",
"--N2": "256",
"--N1": "16777216",
# "--N2": "256",
# "--N1": "256",
# "--N2": "256",
# "--N3": "256",
"--kerevalmethod": "1",
"--M": "1E8",
"--tol": "1E-6"}
# iterate over tol from 1E-6 to 1E-1
Expand Down Expand Up @@ -135,21 +137,21 @@ def build_args(args):
pivot_df = df.pivot(index='tolerance', columns='method')
# print(pivot_df)
# scale the throughput SM by GM
pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM']
# scale setpts SM by GM
pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM']
# remove the GM column
pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
pivot_df.drop(('exec', 'GM'), axis=1, inplace=True)
pivot_df.drop(('exec', 'SM'), axis=1, inplace=True)
print(pivot_df)
# Plot
pivot_df.plot(kind='bar', figsize=(10, 7))
# Find the minimum throughput value
min_val = min(pivot_df[('throughput', 'SM')].min(), 1)
max_val = max(pivot_df[('throughput', 'SM')].max(), 0)
min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
print(min_val, max_val)
plt.ylim(min_val * .99, max_val * 1.01)
# plt.ylim(.8, 1.2)
Expand Down
Loading

0 comments on commit 60f4780

Please sign in to comment.