otpimized nupts driven

flatironinstitute · Jul 12, 2024 · 60f4780 · 60f4780
1 parent 907797c
commit 60f4780
Show file tree

Hide file tree

Showing 7 changed files with 199 additions and 92 deletions.
diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h
@@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
   return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
 }
 
-#define RETURN_IF_CUDA_ERROR                                         \
-  {                                                                  \
-    cudaError_t err = cudaGetLastError();                            \
-    if (err != cudaSuccess) {                                        \
-      printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
-      return FINUFFT_ERR_CUDA_FAILURE;                               \
-    }                                                                \
+#define RETURN_IF_CUDA_ERROR                                                         \
+  {                                                                                  \
+    cudaError_t err = cudaGetLastError();                                            \
+    if (err != cudaSuccess) {                                                        \
+      printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
+             __FILE__, __LINE__);                                                    \
+      return FINUFFT_ERR_CUDA_FAILURE;                                               \
+    }                                                                                \
   }
 
 #define CUDA_FREE_AND_NULL(val, stream, pool_supported)                              \

diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc
@@ -8,7 +8,9 @@
     constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01};
     constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00};
     constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00};
-    for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
+    for (int i = 0; i < 2; i++) {
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==3) {
     constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02};
     constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02};
@@ -17,7 +19,9 @@
     constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01};
     constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00};
     constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00};
-    for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i]))))));
+for (int i=0; i<3; i++) {
+    ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+}
   } else if (w==4) {
     constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02};
     constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03};
@@ -27,7 +31,9 @@
     constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01};
     constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01};
     constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00};
-    for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
+    for (int i=0; i<4; i++) {
+        ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==5) {
     constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02};
     constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03};
@@ -38,7 +44,9 @@
     constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01};
     constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00};
     constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00};
-    for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i]))))))));
+    for (int i=0; i<5; i++) {
+        ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==6) {
     constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03};
     constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03};
@@ -50,7 +58,9 @@
     constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01};
     constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00};
     constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00};
-    for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
+    for (int i=0; i<6; i++) {
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==7) {
     constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03};
     constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04};
@@ -63,7 +73,9 @@
     constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01};
     constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01};
     constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00};
-    for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
+    for (int i=0; i<7; i++) {
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==8) {
     constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03};
     constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04};
@@ -76,7 +88,9 @@
     constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02};
     constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01};
     constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01};
-    for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
+    for (int i = 0; i < 8; i++) {
+        ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]);
+    }
   } else if (w==9) {
     constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04};
     constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04};

diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
@@ -9,10 +9,39 @@
 namespace cufinufft {
 namespace spreadinterp {
 
-template<typename T> static __forceinline__ __device__ T fold_rescale(T x, int N) {
-  static constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
-  const T result                   = x * x2pi + T(0.5);
-  return (result - floor(result)) * T(N);
+template<typename T>
+constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
+  constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
+  constexpr const auto half = T(0.5);
+#if defined(__CUDA_ARCH__)
+  if constexpr (std::is_same_v<T, float>) {
+    auto result = __fmaf_rn(x, x2pi, half);
+    result      = __fsub_rd(result, truncf(result));
+    return __fmul_rd(result, static_cast<T>(N));
+  } else if constexpr (std::is_same_v<T, double>) {
+    auto result = __fma_rn(x, x2pi, half);
+    result      = __dsub_rd(result, trunc(result));
+    return __dmul_rd(result, static_cast<T>(N));
+  } else {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Only float and double are supported.");
+  }
+#else
+  const auto result = std::fma(x, x2pi, half);
+  return (result - std::trunc(result)) * static_cast<T>(N);
+#endif
+}
+
+template<typename T>
+static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
+  if constexpr (std::is_same_v<T, float>) {
+    return __fmaf_rn(a, b, c);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return __fma_rn(a, b, c);
+  } else {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Only float and double are supported.");
+  }
 }
 
 template<typename T>
@@ -23,11 +52,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
    approximation to prolate spheroidal wavefunction (PSWF) of order 0.
    This is the "reference implementation", used by eg common/onedim_* 2/17/17 */
 {
-  if (abs(x) >= opts.ES_halfwidth)
+  if (abs(x) >= T(opts.ES_halfwidth))
     // if spreading/FT careful, shouldn't need this if, but causes no speed hit
     return 0.0;
   else
-    return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x));
+    return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x));
 }
 
 template<typename T>
@@ -53,7 +82,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons
    This is the current evaluation method, since it's faster (except i7 w=16).
    Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 {
-  T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1]
+#ifdef __CUDA_ARCH__
+  __builtin_assume(w >= 2);
+  if constexpr (std::is_same_v<T, float>) {
+    __builtin_assume(w <= 7);
+  }
+  if constexpr (std::is_same_v<T, double>) {
+    __builtin_assume(w <= 16);
+  }
+#endif
+  const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
+  //  T z = 2 * x + w - 1.0;
   // insert the auto-generated code which expects z, w args, writes to ker...
   if (upsampfac == 2.0) { // floating point equality is fine here
     using FLT           = T;

diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py
@@ -41,10 +41,12 @@ def build_args(args):
         "--n_runs": "10",
         "--method": "0",
         "--sort": "1",
-        # "--N1": "16777216",
-        "--N1": "256",
-        "--N2": "256",
+        "--N1": "16777216",
+        # "--N2": "256",
+        # "--N1": "256",
+        # "--N2": "256",
         # "--N3": "256",
+        "--kerevalmethod": "1",
         "--M": "1E8",
         "--tol": "1E-6"}
 # iterate over tol from 1E-6 to 1E-1
@@ -135,21 +137,21 @@ def build_args(args):
 pivot_df = df.pivot(index='tolerance', columns='method')
 # print(pivot_df)
 # scale the throughput SM by GM
-pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
+# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
 # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM']
 # scale setpts SM by GM
-pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
+# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
 # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM']
 # remove the GM column
-pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
+# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
 pivot_df.drop(('exec', 'GM'), axis=1, inplace=True)
 pivot_df.drop(('exec', 'SM'), axis=1, inplace=True)
 print(pivot_df)
 # Plot
 pivot_df.plot(kind='bar', figsize=(10, 7))
 # Find the minimum throughput value
-min_val = min(pivot_df[('throughput', 'SM')].min(), 1)
-max_val = max(pivot_df[('throughput', 'SM')].max(), 0)
+min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
+max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
 print(min_val, max_val)
 plt.ylim(min_val * .99, max_val * 1.01)
 # plt.ylim(.8, 1.2)