From 81d6e57fb0f515b2ddac5fd57cbb6fe17befecc2 Mon Sep 17 00:00:00 2001
From: Luc Berger <lberge@sandia.gov>
Date: Mon, 10 Oct 2022 08:08:08 -0600
Subject: [PATCH 01/11] Merge pull request #1531 from
 tcclevenger/change_staticcrsgraph_typdef_template

Change template type for StaticCrsGraph in BsrMatrix

(cherry picked from commit bb891383b0216cf4b47acffd42611d9d162fab3b)
---
 src/sparse/KokkosSparse_BsrMatrix.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/sparse/KokkosSparse_BsrMatrix.hpp b/src/sparse/KokkosSparse_BsrMatrix.hpp
index a615eff478..12f4dff651 100644
--- a/src/sparse/KokkosSparse_BsrMatrix.hpp
+++ b/src/sparse/KokkosSparse_BsrMatrix.hpp
@@ -390,12 +390,12 @@ class BsrMatrix {
   typedef BsrMatrix<ScalarType, OrdinalType, host_mirror_space, MemoryTraits>
       HostMirror;
   //! Type of the graph structure of the sparse matrix.
-  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
-                                 execution_space, memory_traits, size_type>
+  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
+                                 memory_traits, size_type>
       StaticCrsGraphType;
   //! Type of the graph structure of the sparse matrix - consistent with Kokkos.
-  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
-                                 execution_space, memory_traits, size_type>
+  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
+                                 memory_traits, size_type>
       staticcrsgraph_type;
   //! Type of column indices in the sparse matrix.
   typedef typename staticcrsgraph_type::entries_type index_type;

From 7a0259919ef313f23d32ffd08bac93072e9e3e1e Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Tue, 18 Oct 2022 16:01:39 -0600
Subject: [PATCH 02/11] Merge pull request #1568 from
 bartlettroscoe/tril-11152-remove-undefined-tpl-deps

KokkosKernels: Remove listing of undefined TPL deps (trilinos/Trilinos#11152)

(cherry picked from commit 9acf3000dd8e0e4fd656c3b193578babb8b17337)
---
 cmake/Dependencies.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e8b1c6a5e2..4ce5a98dc0 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
         LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
-        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS
+        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS
         TEST_OPTIONAL_TPLS yaml-cpp
 )
 # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in

From cc418f5bae185b3ece508be7fe21c242ec07d545 Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Tue, 15 Nov 2022 17:06:08 -0700
Subject: [PATCH 03/11] Fix Trilinos issue #11033

(using SpGEMM with nonstandard scalar type, with MKL enabled).
Mirror of Trilinos PR #11278.
---
 src/sparse/KokkosSparse_Utils_mkl.hpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/sparse/KokkosSparse_Utils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp
index b9eb3a9bd2..3b1e28fd84 100644
--- a/src/sparse/KokkosSparse_Utils_mkl.hpp
+++ b/src/sparse/KokkosSparse_Utils_mkl.hpp
@@ -123,16 +123,16 @@ template <typename value_type>
 class MKLSparseMatrix {
   sparse_matrix_t mtx;
 
-  static_assert(mkl_is_supported_value_type<value_type>::value,
-                "Scalar type used in MKLSparseMatrix<value_type> is NOT "
-                "supported by MKL");
-
  public:
   inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {}
 
   // Constructs MKL sparse matrix from KK sparse views (m rows x n cols)
   inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols,
-                         MKL_INT *xadj, MKL_INT *adj, value_type *values);
+                         MKL_INT *xadj, MKL_INT *adj, value_type *values) {
+    throw std::runtime_error(
+        "Scalar type used in MKLSparseMatrix<value_type> is NOT "
+        "supported by MKL");
+  }
 
   // Allows using MKLSparseMatrix directly in MKL calls
   inline operator sparse_matrix_t() const { return mtx; }
@@ -140,7 +140,11 @@ class MKLSparseMatrix {
   // Exports MKL sparse matrix contents into KK views
   inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols,
                           MKL_INT *&rows_start, MKL_INT *&columns,
-                          value_type *&values);
+                          value_type *&values) {
+    throw std::runtime_error(
+        "Scalar type used in MKLSparseMatrix<value_type> is NOT "
+        "supported by MKL");
+  }
 
   inline void destroy() {
     KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx));
@@ -256,4 +260,4 @@ inline void MKLSparseMatrix<Kokkos::complex<double>>::export_data(
 
 #endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
 
-#endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
\ No newline at end of file
+#endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP

From 7a7b5876facb62b4b7ffa579c32682b51d651742 Mon Sep 17 00:00:00 2001
From: Luc Berger <lberge@sandia.gov>
Date: Mon, 7 Nov 2022 11:23:16 -0700
Subject: [PATCH 04/11] Merge pull request #1574 from
 vqd8a/add-cusparse-11-trisolve-api

Use new cusparseSpSV for SPTRSV when cuSPARSE enabled with CUDA >= 11.3

(cherry picked from commit b2acb36eed542c1002467f9012f45549db9f84f0)
---
 src/sparse/KokkosSparse_sptrsv_handle.hpp     |  61 ++++++-
 .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp     | 171 +++++++++++++++++-
 2 files changed, 223 insertions(+), 9 deletions(-)

diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp
index 4c9c98d6c1..7933d11a8c 100644
--- a/src/sparse/KokkosSparse_sptrsv_handle.hpp
+++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp
@@ -50,7 +50,7 @@
 #define KOKKOSSPARSE_SPTRSVHANDLE_HPP
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-#include "cusparse.h"
+#include "KokkosSparse_Utils_cusparse.hpp"
 #endif
 
 #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \
@@ -108,6 +108,8 @@ class SPTRSVHandle {
   typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t;
   typedef typename Kokkos::View<int *, HandlePersistentMemorySpace>
       int_row_view_t;
+  typedef typename Kokkos::View<int64_t *, HandlePersistentMemorySpace>
+      int64_row_view_t;
   // typedef typename row_lno_persistent_work_view_t::HostMirror
   // row_lno_persistent_work_host_view_t; //Host view type
   typedef typename Kokkos::View<
@@ -154,6 +156,46 @@ class SPTRSVHandle {
       mtx_scalar_view_t;
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if (CUDA_VERSION >= 11030)
+  struct cuSparseHandleType {
+    cusparseHandle_t handle;
+    cusparseOperation_t transpose;
+    cusparseSpMatDescr_t matDescr;
+    cusparseDnVecDescr_t vecBDescr, vecBDescr_dummy;
+    cusparseDnVecDescr_t vecXDescr, vecXDescr_dummy;
+    cusparseSpSVDescr_t spsvDescr;
+    void *pBuffer{nullptr};
+
+    cuSparseHandleType(bool transpose_, bool is_lower) {
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));
+
+      KOKKOS_CUSPARSE_SAFE_CALL(
+          cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
+
+      if (transpose_) {
+        transpose = CUSPARSE_OPERATION_TRANSPOSE;
+      } else {
+        transpose = CUSPARSE_OPERATION_NON_TRANSPOSE;
+      }
+
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_createDescr(&spsvDescr));
+    }
+
+    ~cuSparseHandleType() {
+      if (pBuffer != nullptr) {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(pBuffer));
+        pBuffer = nullptr;
+      }
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecBDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecBDescr_dummy));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecXDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecXDescr_dummy));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
+    }
+  };
+#else  // CUDA_VERSION < 11030
   struct cuSparseHandleType {
     cusparseHandle_t handle;
     cusparseOperation_t transpose;
@@ -202,6 +244,7 @@ class SPTRSVHandle {
       cusparseDestroy(handle);
     }
   };
+#endif
 
   typedef cuSparseHandleType SPTRSVcuSparseHandleType;
 #endif
@@ -337,6 +380,7 @@ class SPTRSVHandle {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
   SPTRSVcuSparseHandleType *cuSPARSEHandle;
   int_row_view_t tmp_int_rowmap;
+  int64_row_view_t tmp_int64_rowmap;
 #endif
 
 #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
@@ -443,7 +487,8 @@ class SPTRSVHandle {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
         ,
         cuSPARSEHandle(nullptr),
-        tmp_int_rowmap()
+        tmp_int_rowmap(),
+        tmp_int64_rowmap()
 #endif
 #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
         ,
@@ -851,6 +896,18 @@ class SPTRSVHandle {
   }
   int_row_view_t get_int_rowmap_view() { return tmp_int_rowmap; }
   int *get_int_rowmap_ptr() { return tmp_int_rowmap.data(); }
+
+  void allocate_tmp_int64_rowmap(size_type N) {
+    tmp_int64_rowmap = int64_row_view_t(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp_int64_rowmap"), N);
+  }
+  template <typename RowViewType>
+  int64_t *get_int64_rowmap_ptr_copy(const RowViewType &rowmap) {
+    Kokkos::deep_copy(tmp_int64_rowmap, rowmap);
+    Kokkos::fence();
+    return tmp_int64_rowmap.data();
+  }
+  int64_t *get_int64_rowmap_ptr() { return tmp_int64_rowmap.data(); }
 #endif
 
   bool algm_requires_symb_lvlsched() const {
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
index 61d0dc3ccf..19af87b91e 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
@@ -45,9 +45,8 @@
 #ifndef _KOKKOSSPTRSVCUSPARSE_HPP
 #define _KOKKOSSPTRSVCUSPARSE_HPP
 
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-#include "cusparse.h"
-#endif
+#include "KokkosSparse_Utils_cusparse.hpp"
+
 namespace KokkosSparse {
 namespace Impl {
 
@@ -60,6 +59,116 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle,
                              ain_nonzero_index_view_type entries,
                              ain_values_scalar_view_type values, bool trans) {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if (CUDA_VERSION >= 11030)
+  typedef typename KernelHandle::nnz_lno_t idx_type;
+  typedef typename KernelHandle::size_type size_type;
+  typedef typename KernelHandle::scalar_t scalar_type;
+  typedef typename KernelHandle::memory_space memory_space;
+  typedef typename KernelHandle::nnz_scalar_view_t nnz_scalar_view_t;
+
+  const bool is_cuda_space =
+      std::is_same<memory_space, Kokkos::CudaSpace>::value ||
+      std::is_same<memory_space, Kokkos::CudaUVMSpace>::value ||
+      std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value;
+
+  const bool is_idx_type_supported = std::is_same<idx_type, int>::value ||
+                                     std::is_same<idx_type, int64_t>::value;
+
+  if (!is_cuda_space) {
+    throw std::runtime_error(
+        "KokkosKernels sptrsvcuSPARSE_symbolic: MEMORY IS NOT ALLOCATED IN GPU "
+        "DEVICE for CUSPARSE\n");
+  } else if (!is_idx_type_supported) {
+    throw std::runtime_error(
+        "CUSPARSE requires local ordinals to be integer (32 bits or 64 "
+        "bits).\n");
+  } else {
+    bool is_lower = sptrsv_handle->is_lower_tri();
+    sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower);
+
+    typename KernelHandle::SPTRSVcuSparseHandleType* h =
+        sptrsv_handle->get_cuSparseHandle();
+
+    int64_t nnz = static_cast<int64_t>(entries.extent(0));
+    size_t pBufferSize;
+    void* rm;
+    // NOTE (Oct-29-2022):
+    // cusparseCreateCsr only supports the same sizes (either 32 bits or 64
+    // bits) for row_map_type and entries_type
+    if (std::is_same<idx_type, int>::value) {
+      if (!std::is_same<size_type, int>::value) {
+        sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0));
+        rm = (void*)sptrsv_handle->get_int_rowmap_ptr_copy(row_map);
+      } else {
+        rm = (void*)row_map.data();
+      }
+    } else {  // idx_type has 64 bits
+      if (!std::is_same<size_type, int64_t>::value) {
+        sptrsv_handle->allocate_tmp_int64_rowmap(row_map.extent(0));
+        rm = (void*)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map);
+      } else {
+        rm = (void*)row_map.data();
+      }
+    }
+    const scalar_type alpha = scalar_type(1.0);
+
+    cusparseIndexType_t cudaCsrRowMapType =
+        cusparse_index_type_t_from<idx_type>();
+    cusparseIndexType_t cudaCsrColIndType =
+        cusparse_index_type_t_from<idx_type>();
+    cudaDataType cudaValueType = cuda_data_type_from<scalar_type>();
+
+    // Create sparse matrix in CSR format
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+        &(h->matDescr), static_cast<int64_t>(nrows),
+        static_cast<int64_t>(nrows), nnz, rm, (void*)entries.data(),
+        (void*)values.data(), cudaCsrRowMapType, cudaCsrColIndType,
+        CUSPARSE_INDEX_BASE_ZERO, cudaValueType));
+
+    // Create dummy dense vector B (RHS)
+    nnz_scalar_view_t b_dummy("b_dummy", nrows);
+    KOKKOS_CUSPARSE_SAFE_CALL(
+        cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast<int64_t>(nrows),
+                            b_dummy.data(), cudaValueType));
+
+    // Create dummy dense vector X (LHS)
+    nnz_scalar_view_t x_dummy("x_dummy", nrows);
+    KOKKOS_CUSPARSE_SAFE_CALL(
+        cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast<int64_t>(nrows),
+                            x_dummy.data(), cudaValueType));
+
+    // Specify Lower|Upper fill mode
+    if (is_lower) {
+      cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER;
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute(
+          h->matDescr, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode)));
+    } else {
+      cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_UPPER;
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute(
+          h->matDescr, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode)));
+    }
+
+    // Specify Unit|Non-Unit diagonal type.
+    cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute(
+        h->matDescr, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype)));
+
+    // Allocate an external buffer for analysis
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_bufferSize(
+        h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr_dummy,
+        h->vecXDescr_dummy, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT,
+        h->spsvDescr, &pBufferSize));
+
+    // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes.
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void**)&(h->pBuffer), pBufferSize));
+
+    // Run analysis
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_analysis(
+        h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr_dummy,
+        h->vecXDescr_dummy, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT,
+        h->spsvDescr, h->pBuffer));
+  }
+#else  // CUDA_VERSION < 11030
   typedef typename KernelHandle::nnz_lno_t idx_type;
   typedef typename KernelHandle::size_type size_type;
   typedef typename KernelHandle::scalar_t scalar_type;
@@ -137,7 +246,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle,
 
       if (CUSPARSE_STATUS_SUCCESS != status)
         std::cout << "analysis status error name " << (status) << std::endl;
-    } else if (std::is_same<scalar_type, Kokkos::complex<double>>::value) {
+    } else if (std::is_same<scalar_type, Kokkos::complex<double> >::value) {
       cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr,
                                  (cuDoubleComplex*)vals, (int*)rm, (int*)ent,
                                  h->info, &pBufferSize);
@@ -156,7 +265,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle,
 
       if (CUSPARSE_STATUS_SUCCESS != status)
         std::cout << "analysis status error name " << (status) << std::endl;
-    } else if (std::is_same<scalar_type, Kokkos::complex<float>>::value) {
+    } else if (std::is_same<scalar_type, Kokkos::complex<float> >::value) {
       cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr,
                                  (cuComplex*)vals, (int*)rm, (int*)ent, h->info,
                                  &pBufferSize);
@@ -182,6 +291,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle,
     throw std::runtime_error(
         "CUSPARSE requires local ordinals to be integer.\n");
   }
+#endif
 #else
   (void)sptrsv_handle;
   (void)nrows;
@@ -207,6 +317,52 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle,
                           x_values_scalar_view_type lhs, bool /*trans*/
 ) {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if (CUDA_VERSION >= 11030)
+  typedef typename KernelHandle::nnz_lno_t idx_type;
+  typedef typename KernelHandle::size_type size_type;
+  typedef typename KernelHandle::scalar_t scalar_type;
+  typedef typename KernelHandle::memory_space memory_space;
+
+  const bool is_cuda_space =
+      std::is_same<memory_space, Kokkos::CudaSpace>::value ||
+      std::is_same<memory_space, Kokkos::CudaUVMSpace>::value ||
+      std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value;
+
+  const bool is_idx_type_supported = std::is_same<idx_type, int>::value ||
+                                     std::is_same<idx_type, int64_t>::value;
+
+  if (!is_cuda_space) {
+    throw std::runtime_error(
+        "KokkosKernels sptrsvcuSPARSE_solve: MEMORY IS NOT ALLOCATED IN GPU "
+        "DEVICE for CUSPARSE\n");
+  } else if (!is_idx_type_supported) {
+    throw std::runtime_error(
+        "CUSPARSE requires local ordinals to be integer (32 bits or 64 "
+        "bits).\n");
+  } else {
+    typename KernelHandle::SPTRSVcuSparseHandleType* h =
+        sptrsv_handle->get_cuSparseHandle();
+
+    const scalar_type alpha = scalar_type(1.0);
+
+    cudaDataType cudaValueType = cuda_data_type_from<scalar_type>();
+
+    // Create dense vector B (RHS)
+    KOKKOS_CUSPARSE_SAFE_CALL(
+        cusparseCreateDnVec(&(h->vecBDescr), static_cast<int64_t>(nrows),
+                            (void*)rhs.data(), cudaValueType));
+
+    // Create dense vector X (LHS)
+    KOKKOS_CUSPARSE_SAFE_CALL(
+        cusparseCreateDnVec(&(h->vecXDescr), static_cast<int64_t>(nrows),
+                            (void*)lhs.data(), cudaValueType));
+
+    // Solve
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve(
+        h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr,
+        h->vecXDescr, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, h->spsvDescr));
+  }
+#else  // CUDA_VERSION < 11030
   typedef typename KernelHandle::nnz_lno_t idx_type;
   typedef typename KernelHandle::size_type size_type;
   typedef typename KernelHandle::scalar_t scalar_type;
@@ -253,7 +409,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle,
 
       if (CUSPARSE_STATUS_SUCCESS != status)
         std::cout << "solve status error name " << (status) << std::endl;
-    } else if (std::is_same<scalar_type, Kokkos::complex<double>>::value) {
+    } else if (std::is_same<scalar_type, Kokkos::complex<double> >::value) {
       cuDoubleComplex cualpha;
       cualpha.x = 1.0;
       cualpha.y = 0.0;
@@ -264,7 +420,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle,
 
       if (CUSPARSE_STATUS_SUCCESS != status)
         std::cout << "solve status error name " << (status) << std::endl;
-    } else if (std::is_same<scalar_type, Kokkos::complex<float>>::value) {
+    } else if (std::is_same<scalar_type, Kokkos::complex<float> >::value) {
       cuComplex cualpha;
       cualpha.x = 1.0;
       cualpha.y = 0.0;
@@ -283,6 +439,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle,
     throw std::runtime_error(
         "CUSPARSE requires local ordinals to be integer.\n");
   }
+#endif
 #else
   (void)sptrsv_handle;
   (void)nrows;

From b13f9586cdee498163db5de6783ea5f97ed08d0d Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Thu, 17 Nov 2022 11:41:52 -0700
Subject: [PATCH 05/11] Merge pull request #1590 from
 vqd8a/move-destroy-dense-vec-desc-out-of-cusparsehandle

Move destroying dense vector descriptors out of cuSparse sptrsv handle

(cherry picked from commit 1d0b48c86596250c2854bbac03e7e09496345b28)
---
 src/sparse/KokkosSparse_sptrsv_handle.hpp             | 4 ----
 src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 8 ++++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp
index 7933d11a8c..a5aacca361 100644
--- a/src/sparse/KokkosSparse_sptrsv_handle.hpp
+++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp
@@ -187,10 +187,6 @@ class SPTRSVHandle {
         pBuffer = nullptr;
       }
       KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr));
-      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecBDescr));
-      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecBDescr_dummy));
-      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecXDescr));
-      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecXDescr_dummy));
       KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr));
       KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
     }
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
index 19af87b91e..a45d98eea9 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
@@ -167,6 +167,10 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle,
         h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr_dummy,
         h->vecXDescr_dummy, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT,
         h->spsvDescr, h->pBuffer));
+
+    // Destroy dummy dense vector descriptors
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecBDescr_dummy));
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecXDescr_dummy));
   }
 #else  // CUDA_VERSION < 11030
   typedef typename KernelHandle::nnz_lno_t idx_type;
@@ -361,6 +365,10 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle,
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve(
         h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr,
         h->vecXDescr, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, h->spsvDescr));
+
+    // Destroy dense vector descriptors
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecBDescr));
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecXDescr));
   }
 #else  // CUDA_VERSION < 11030
   typedef typename KernelHandle::nnz_lno_t idx_type;

From 023898ae9892fff6aa18ab3fa5ee18367ceda7f3 Mon Sep 17 00:00:00 2001
From: Luc Berger <lberge@sandia.gov>
Date: Sat, 26 Nov 2022 14:25:27 -0700
Subject: [PATCH 06/11] Merge pull request #1604 from
 vqd8a/fix-typo-cuda-data-type-from

Fix cuda_data_type_from to return CUDA_C_64F for Kokkos::complex<double>

(cherry picked from commit 7d05d9d9d3372f8987da2ddbaa44f5164d475b68)
---
 src/sparse/KokkosSparse_Utils_cusparse.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp
index 6e9eee5ab5..cd1d64ee56 100644
--- a/src/sparse/KokkosSparse_Utils_cusparse.hpp
+++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp
@@ -151,7 +151,7 @@ inline cudaDataType cuda_data_type_from<Kokkos::complex<float>>() {
 }
 template <>
 inline cudaDataType cuda_data_type_from<Kokkos::complex<double>>() {
-  return CUDA_C_32F;
+  return CUDA_C_64F;
 }
 
 #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)

From 19ecafd07f64e4a940501a2e8867cc662478826b Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Mon, 28 Nov 2022 14:58:15 -0700
Subject: [PATCH 07/11] Merge pull request #1605 from
 vqd8a/disable-compile-check-scalar-type-cuSPARSE

Disable compile-time check in cuda_data_type_from on supported scalar types for cuSPARSE

(cherry picked from commit 7cb14fc94c1065d22ea9a68f7100222ab9e58ab9)
---
 src/sparse/KokkosSparse_Utils_cusparse.hpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp
index cd1d64ee56..5ca7f40698 100644
--- a/src/sparse/KokkosSparse_Utils_cusparse.hpp
+++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp
@@ -116,9 +116,12 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus,
 
 template <typename T>
 cudaDataType cuda_data_type_from() {
+  // Note:  compile-time failure is disabled to allow for packages such as
+  // Ifpack2 to more easily support scalar types that cuSPARSE may not.
+
   // compile-time failure with a nice message if called on an unsupported type
-  static_assert(!std::is_same<T, T>::value,
-                "cuSparse TPL does not support scalar type");
+  // static_assert(!std::is_same<T, T>::value,
+  //               "cuSparse TPL does not support scalar type");
   // static_assert(false, ...) is allowed to error even if the code is not
   // instantiated. obfuscate the predicate Despite this function being
   // uncompilable, the compiler may decide that a return statement is missing,

From 9b3b0457aed0f9598ae11210e55186ccbac95785 Mon Sep 17 00:00:00 2001
From: Evan Harvey <57234914+e10harvey@users.noreply.github.com>
Date: Thu, 17 Nov 2022 11:00:27 -0700
Subject: [PATCH 08/11] Merge pull request #1588 from e10harvey/issue1547

batched/dense: Reduce register pressure

(cherry picked from commit a131d8bc421bc5ac68542c286a1d5d2df8f00bbc)
---
 src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 59 +++++++++++++++----
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp
index 9e830c95d4..a9bc848789 100644
--- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp
+++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp
@@ -259,6 +259,42 @@ template <class ArgTransA, class ArgTransB, class ArgBatchSzDim,
           int tile_m, int tile_n, int tile_k>
 class BatchedDblBufGemm;
 
+//////////////////////////////// tile_m //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() {
+  return 32;
+}
+//////////////////////////////// tile_n //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() {
+  return 32;
+}
+//////////////////////////////// tile_k //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() {
+  return 8;
+}
+
+// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails
+// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547.
+// This reduces the register allocations (REG_M and REG_N) in the double
+// buffering algorithm by a factor of 2.
+#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908)
+template <>
+constexpr KOKKOS_INLINE_FUNCTION int
+kk_gemm_dlb_buf_tile_k<Kokkos::Experimental::HIP>() {
+  return 16;
+}
+#endif
+////////////////////////// alpha_in_fma_thresh ////////////////////////////
+constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() {
+#ifdef __CUDACC_RDC__
+  return 24;
+#else
+  return 64;
+#endif  // __CUDACC_RDC__
+}
+
 // clang-format off
 /// \brief Blocking solve of general matrix multiply on a batch of uniform matrices.
 ///
@@ -458,19 +494,19 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
   // Begin checking conditions for optimal BatchedGemm invocation.
   using view_scalar_type   = typename CViewType::value_type;
   using layout_type        = typename CViewType::array_layout;
+  using exec_space         = typename CViewType::execution_space;
   constexpr bool is_vector = KokkosBatched::is_vector<view_scalar_type>::value;
-  constexpr bool on_gpu    = KokkosKernels::Impl::kk_is_gpu_exec_space<
-      typename CViewType::execution_space>();
+  constexpr bool on_gpu =
+      KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>();
   constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space<
-      typename CViewType::execution_space::memory_space>();
+      typename exec_space::memory_space>();
   constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space<
-      typename CViewType::execution_space::memory_space>();
+      typename exec_space::memory_space>();
 
   if (handle->enableDebug) {
     std::cout << "view_scalar_type:" << typeid(view_scalar_type).name()
               << std::endl
-              << "execution_space:"
-              << typeid(typename CViewType::execution_space).name() << std::endl
+              << "execution_space:" << typeid(exec_space).name() << std::endl
               << std::endl
               << "is_vector:" << is_vector << std::endl
               << "on_gpu:" << on_gpu << std::endl
@@ -521,12 +557,11 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
                          ? (c_m >= 16)
                          : (c_m >= 24 && c_m <= 32) || c_m >= 40)) {
         handle->teamSz = handle->vecLen = 8;
-        constexpr int tile_m = 32, tile_n = 32, tile_k = 8;
-#ifdef __CUDACC_RDC__
-        constexpr size_t alpha_in_fma_thresh = 24;
-#else
-        constexpr size_t alpha_in_fma_thresh = 64;
-#endif  // __CUDAACC_RDC__
+        constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m<exec_space>();
+        constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n<exec_space>();
+        constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k<exec_space>();
+        constexpr size_t alpha_in_fma_thresh =
+            Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh();
 
         if (c_m % 32 == 0) {                 // No bounds checking
           if (c_m >= alpha_in_fma_thresh) {  // apply alpha in fma

From 0a46443eebffce0569c7a762dbbd6b6df88ca154 Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Thu, 1 Dec 2022 12:56:28 -0700
Subject: [PATCH 09/11] Update to version 3.7.01

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40d6dd407b..9d39c2bef1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
   ENDIF()
   SET(KokkosKernels_VERSION_MAJOR 3)
   SET(KokkosKernels_VERSION_MINOR 7)
-  SET(KokkosKernels_VERSION_PATCH 00)
+  SET(KokkosKernels_VERSION_PATCH 01)
   SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")
   MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}")
 ENDIF()

From 6cb632b6a2b0461e4e796f329c2a33f5a5f011d0 Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Thu, 1 Dec 2022 12:57:18 -0700
Subject: [PATCH 10/11] Update changelog

---
 CHANGELOG.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 46c4eeaf5f..3a788e353f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Change Log
 
+## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01)
+[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01)
+
+### Bug Fixes:
+
+- Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos-kernels/pull/1531)
+- Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568)
+- Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591)
+- Move destroying dense vector descriptors out of cuSparse sptrsv handle [\#1590](https://github.com/kokkos/kokkos-kernels/pull/1590)
+- Fix `cuda_data_type_from` to return `CUDA_C_64F` for `Kokkos::complex<double>` [\#1604](https://github.com/kokkos/kokkos-kernels/pull/1604)
+- Disable compile-time check in `cuda_data_type_from` on supported scalar types for cuSPARSE [\#1605](https://github.com/kokkos/kokkos-kernels/pull/1605)
+- Reduce register pressure in batched dense algorithms [\#1588](https://github.com/kokkos/kokkos-kernels/pull/1588)
+
+### Implemented enhancements:
+
+- Use new cusparseSpSV TPL for SPTRSV when cuSPARSE is enabled with CUDA >= 11.3 [\#1574](https://github.com/kokkos/kokkos-kernels/pull/1574)
+
 ## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18)
 [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00)
 

From 146b9f89de027d7c8066925eb2400b723666626d Mon Sep 17 00:00:00 2001
From: Nathan Ellingwood <ndellin@sandia.gov>
Date: Thu, 1 Dec 2022 13:01:24 -0700
Subject: [PATCH 11/11] update master_history

---
 master_history.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/master_history.txt b/master_history.txt
index 91399d7ba0..f2632e4fbb 100644
--- a/master_history.txt
+++ b/master_history.txt
@@ -18,3 +18,4 @@ tag: 3.5.00     date: 11/19/2021  master: 00189c0b    release: f171533d
 tag: 3.6.00     date: 04/06/2022  master: 8381db04    release: a7e683c4
 tag: 3.6.01     date: 05/23/2022  master: e09389ae    release: e1d8de42
 tag: 3.7.00     date: 08/25/2022  master: 42ab7a29    release: 9cc88ffa
+tag: 3.7.01     date: 12/01/2022  master: 04821ac3    release: 6cb632b6