diff --git a/CHANGELOG.md b/CHANGELOG.md index 46c4eeaf5f..3a788e353f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ # Change Log +## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01) + +### Bug Fixes: + +- Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos/pull/1531) +- Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos/pull/1568) +- Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos/pull/1591) +- Move destroying dense vector descriptors out of cuSparse sptrsv handle [\#1590](https://github.com/kokkos/kokkos/pull/1590) +- Fix `cuda_data_type_from` to return `CUDA_C_64F` for `Kokkos::complex` [\#1604](https://github.com/kokkos/kokkos/pull/1604) +- Disable compile-time check in cuda_data_type_from on supported scalar types for cuSPARSE [\#1605](https://github.com/kokkos/kokkos/pull/1605) +- Reduce register pressure in batched dense algorithms [\#1588](https://github.com/kokkos/kokkos/pull/1588) + +### Implemented enhancements: + +- Use new cusparseSpSV TPL for SPTRSV when cuSPARSE is enabled with CUDA >= 11.3 [\#1574](https://github.com/kokkos/kokkos/pull/1574) + ## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index 40d6dd407b..9d39c2bef1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) SET(KokkosKernels_VERSION_MINOR 7) - SET(KokkosKernels_VERSION_PATCH 00) + SET(KokkosKernels_VERSION_PATCH 01) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e8b1c6a5e2..4ce5a98dc0 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/master_history.txt b/master_history.txt index 91399d7ba0..f2632e4fbb 100644 --- a/master_history.txt +++ b/master_history.txt @@ -18,3 +18,4 @@ tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa +tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 9e830c95d4..a9bc848789 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -259,6 +259,42 @@ template class BatchedDblBufGemm; +//////////////////////////////// tile_m ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { + return 32; +} +//////////////////////////////// tile_n ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() { + return 32; +} +//////////////////////////////// tile_k ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() { + return 8; +} + +// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails +// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547. +// This reduces the register allocations (REG_M and REG_N) in the double +// buffering algorithm by a factor of 2. +#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) +template <> +constexpr KOKKOS_INLINE_FUNCTION int +kk_gemm_dlb_buf_tile_k() { + return 16; +} +#endif +////////////////////////// alpha_in_fma_thresh //////////////////////////// +constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { +#ifdef __CUDACC_RDC__ + return 24; +#else + return 64; +#endif // __CUDAACC_RDC__ +} + // clang-format off /// \brief Blocking solve of general matrix multiply on a batch of uniform matrices. /// @@ -458,19 +494,19 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // Begin checking conditions for optimal BatchedGemm invocation. using view_scalar_type = typename CViewType::value_type; using layout_type = typename CViewType::array_layout; + using exec_space = typename CViewType::execution_space; constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space< - typename CViewType::execution_space>(); + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename CViewType::execution_space::memory_space>(); + typename exec_space::memory_space>(); constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename CViewType::execution_space::memory_space>(); + typename exec_space::memory_space>(); if (handle->enableDebug) { std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() << std::endl - << "execution_space:" - << typeid(typename CViewType::execution_space).name() << std::endl + << "execution_space:" << typeid(exec_space).name() << std::endl << std::endl << "is_vector:" << is_vector << std::endl << "on_gpu:" << on_gpu << std::endl @@ -521,12 +557,11 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, ? (c_m >= 16) : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = 32, tile_n = 32, tile_k = 8; -#ifdef __CUDACC_RDC__ - constexpr size_t alpha_in_fma_thresh = 24; -#else - constexpr size_t alpha_in_fma_thresh = 64; -#endif // __CUDAACC_RDC__ + constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = + Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); if (c_m % 32 == 0) { // No bounds checking if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma diff --git a/src/sparse/KokkosSparse_BsrMatrix.hpp b/src/sparse/KokkosSparse_BsrMatrix.hpp index a615eff478..12f4dff651 100644 --- a/src/sparse/KokkosSparse_BsrMatrix.hpp +++ b/src/sparse/KokkosSparse_BsrMatrix.hpp @@ -390,12 +390,12 @@ class BsrMatrix { typedef BsrMatrix HostMirror; //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph staticcrsgraph_type; //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp index 6e9eee5ab5..5ca7f40698 100644 --- a/src/sparse/KokkosSparse_Utils_cusparse.hpp +++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp @@ -116,9 +116,12 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus, template cudaDataType cuda_data_type_from() { + // Note: compile-time failure is disabled to allow for packages such as + // Ifpack2 to more easily support scalar types that cuSPARSE may not. + // compile-time failure with a nice message if called on an unsupported type - static_assert(!std::is_same::value, - "cuSparse TPL does not support scalar type"); + // static_assert(!std::is_same::value, + // "cuSparse TPL does not support scalar type"); // static_assert(false, ...) is allowed to error even if the code is not // instantiated. obfuscate the predicate Despite this function being // uncompilable, the compiler may decide that a return statement is missing, @@ -151,7 +154,7 @@ inline cudaDataType cuda_data_type_from>() { } template <> inline cudaDataType cuda_data_type_from>() { - return CUDA_C_32F; + return CUDA_C_64F; } #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) diff --git a/src/sparse/KokkosSparse_Utils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp index b9eb3a9bd2..3b1e28fd84 100644 --- a/src/sparse/KokkosSparse_Utils_mkl.hpp +++ b/src/sparse/KokkosSparse_Utils_mkl.hpp @@ -123,16 +123,16 @@ template class MKLSparseMatrix { sparse_matrix_t mtx; - static_assert(mkl_is_supported_value_type::value, - "Scalar type used in MKLSparseMatrix is NOT " - "supported by MKL"); - public: inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} // Constructs MKL sparse matrix from KK sparse views (m rows x n cols) inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols, - MKL_INT *xadj, MKL_INT *adj, value_type *values); + MKL_INT *xadj, MKL_INT *adj, value_type *values) { + throw std::runtime_error( + "Scalar type used in MKLSparseMatrix is NOT " + "supported by MKL"); + } // Allows using MKLSparseMatrix directly in MKL calls inline operator sparse_matrix_t() const { return mtx; } @@ -140,7 +140,11 @@ class MKLSparseMatrix { // Exports MKL sparse matrix contents into KK views inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, MKL_INT *&columns, - value_type *&values); + value_type *&values) { + throw std::runtime_error( + "Scalar type used in MKLSparseMatrix is NOT " + "supported by MKL"); + } inline void destroy() { KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); @@ -256,4 +260,4 @@ inline void MKLSparseMatrix>::export_data( #endif // KOKKOSKERNELS_ENABLE_TPL_MKL -#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP \ No newline at end of file +#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp index 4c9c98d6c1..a5aacca361 100644 --- a/src/sparse/KokkosSparse_sptrsv_handle.hpp +++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp @@ -50,7 +50,7 @@ #define KOKKOSSPARSE_SPTRSVHANDLE_HPP #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#include "cusparse.h" +#include "KokkosSparse_Utils_cusparse.hpp" #endif #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ @@ -108,6 +108,8 @@ class SPTRSVHandle { typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t; typedef typename Kokkos::View int_row_view_t; + typedef typename Kokkos::View + int64_row_view_t; // typedef typename row_lno_persistent_work_view_t::HostMirror // row_lno_persistent_work_host_view_t; //Host view type typedef typename Kokkos::View< @@ -154,6 +156,42 @@ class SPTRSVHandle { mtx_scalar_view_t; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if (CUDA_VERSION >= 11030) + struct cuSparseHandleType { + cusparseHandle_t handle; + cusparseOperation_t transpose; + cusparseSpMatDescr_t matDescr; + cusparseDnVecDescr_t vecBDescr, vecBDescr_dummy; + cusparseDnVecDescr_t vecXDescr, vecXDescr_dummy; + cusparseSpSVDescr_t spsvDescr; + void *pBuffer{nullptr}; + + cuSparseHandleType(bool transpose_, bool is_lower) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle)); + + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST)); + + if (transpose_) { + transpose = CUSPARSE_OPERATION_TRANSPOSE; + } else { + transpose = CUSPARSE_OPERATION_NON_TRANSPOSE; + } + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_createDescr(&spsvDescr)); + } + + ~cuSparseHandleType() { + if (pBuffer != nullptr) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(pBuffer)); + pBuffer = nullptr; + } + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle)); + } + }; +#else // CUDA_VERSION < 11030 struct cuSparseHandleType { cusparseHandle_t handle; cusparseOperation_t transpose; @@ -202,6 +240,7 @@ class SPTRSVHandle { cusparseDestroy(handle); } }; +#endif typedef cuSparseHandleType SPTRSVcuSparseHandleType; #endif @@ -337,6 +376,7 @@ class SPTRSVHandle { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE SPTRSVcuSparseHandleType *cuSPARSEHandle; int_row_view_t tmp_int_rowmap; + int64_row_view_t tmp_int64_rowmap; #endif #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV @@ -443,7 +483,8 @@ class SPTRSVHandle { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE , cuSPARSEHandle(nullptr), - tmp_int_rowmap() + tmp_int_rowmap(), + tmp_int64_rowmap() #endif #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV , @@ -851,6 +892,18 @@ class SPTRSVHandle { } int_row_view_t get_int_rowmap_view() { return tmp_int_rowmap; } int *get_int_rowmap_ptr() { return tmp_int_rowmap.data(); } + + void allocate_tmp_int64_rowmap(size_type N) { + tmp_int64_rowmap = int64_row_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp_int64_rowmap"), N); + } + template + int64_t *get_int64_rowmap_ptr_copy(const RowViewType &rowmap) { + Kokkos::deep_copy(tmp_int64_rowmap, rowmap); + Kokkos::fence(); + return tmp_int64_rowmap.data(); + } + int64_t *get_int64_rowmap_ptr() { return tmp_int64_rowmap.data(); } #endif bool algm_requires_symb_lvlsched() const { diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 61d0dc3ccf..a45d98eea9 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -45,9 +45,8 @@ #ifndef _KOKKOSSPTRSVCUSPARSE_HPP #define _KOKKOSSPTRSVCUSPARSE_HPP -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#include "cusparse.h" -#endif +#include "KokkosSparse_Utils_cusparse.hpp" + namespace KokkosSparse { namespace Impl { @@ -60,6 +59,120 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, ain_nonzero_index_view_type entries, ain_values_scalar_view_type values, bool trans) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if (CUDA_VERSION >= 11030) + typedef typename KernelHandle::nnz_lno_t idx_type; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::scalar_t scalar_type; + typedef typename KernelHandle::memory_space memory_space; + typedef typename KernelHandle::nnz_scalar_view_t nnz_scalar_view_t; + + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const bool is_idx_type_supported = std::is_same::value || + std::is_same::value; + + if (!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_symbolic: MEMORY IS NOT ALLOCATED IN GPU " + "DEVICE for CUSPARSE\n"); + } else if (!is_idx_type_supported) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer (32 bits or 64 " + "bits).\n"); + } else { + bool is_lower = sptrsv_handle->is_lower_tri(); + sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); + + typename KernelHandle::SPTRSVcuSparseHandleType* h = + sptrsv_handle->get_cuSparseHandle(); + + int64_t nnz = static_cast(entries.extent(0)); + size_t pBufferSize; + void* rm; + // NOTE (Oct-29-2022): + // cusparseCreateCsr only supports the same sizes (either 32 bits or 64 + // bits) for row_map_type and entries_type + if (std::is_same::value) { + if (!std::is_same::value) { + sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); + rm = (void*)sptrsv_handle->get_int_rowmap_ptr_copy(row_map); + } else { + rm = (void*)row_map.data(); + } + } else { // idx_type has 64 bits + if (!std::is_same::value) { + sptrsv_handle->allocate_tmp_int64_rowmap(row_map.extent(0)); + rm = (void*)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map); + } else { + rm = (void*)row_map.data(); + } + } + const scalar_type alpha = scalar_type(1.0); + + cusparseIndexType_t cudaCsrRowMapType = + cusparse_index_type_t_from(); + cusparseIndexType_t cudaCsrColIndType = + cusparse_index_type_t_from(); + cudaDataType cudaValueType = cuda_data_type_from(); + + // Create sparse matrix in CSR format + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &(h->matDescr), static_cast(nrows), + static_cast(nrows), nnz, rm, (void*)entries.data(), + (void*)values.data(), cudaCsrRowMapType, cudaCsrColIndType, + CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); + + // Create dummy dense vector B (RHS) + nnz_scalar_view_t b_dummy("b_dummy", nrows); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), + b_dummy.data(), cudaValueType)); + + // Create dummy dense vector X (LHS) + nnz_scalar_view_t x_dummy("x_dummy", nrows); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), + x_dummy.data(), cudaValueType)); + + // Specify Lower|Upper fill mode + if (is_lower) { + cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_LOWER; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute( + h->matDescr, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode))); + } else { + cusparseFillMode_t fillmode = CUSPARSE_FILL_MODE_UPPER; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute( + h->matDescr, CUSPARSE_SPMAT_FILL_MODE, &fillmode, sizeof(fillmode))); + } + + // Specify Unit|Non-Unit diagonal type. + cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMatSetAttribute( + h->matDescr, CUSPARSE_SPMAT_DIAG_TYPE, &diagtype, sizeof(diagtype))); + + // Allocate an external buffer for analysis + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_bufferSize( + h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr_dummy, + h->vecXDescr_dummy, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, + h->spsvDescr, &pBufferSize)); + + // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void**)&(h->pBuffer), pBufferSize)); + + // Run analysis + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_analysis( + h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr_dummy, + h->vecXDescr_dummy, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, + h->spsvDescr, h->pBuffer)); + + // Destroy dummy dense vector descriptors + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecBDescr_dummy)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecXDescr_dummy)); + } +#else // CUDA_VERSION < 11030 typedef typename KernelHandle::nnz_lno_t idx_type; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::scalar_t scalar_type; @@ -137,7 +250,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; - } else if (std::is_same>::value) { + } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, &pBufferSize); @@ -156,7 +269,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; - } else if (std::is_same>::value) { + } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, (int*)rm, (int*)ent, h->info, &pBufferSize); @@ -182,6 +295,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); } +#endif #else (void)sptrsv_handle; (void)nrows; @@ -207,6 +321,56 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, x_values_scalar_view_type lhs, bool /*trans*/ ) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if (CUDA_VERSION >= 11030) + typedef typename KernelHandle::nnz_lno_t idx_type; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::scalar_t scalar_type; + typedef typename KernelHandle::memory_space memory_space; + + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const bool is_idx_type_supported = std::is_same::value || + std::is_same::value; + + if (!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_solve: MEMORY IS NOT ALLOCATED IN GPU " + "DEVICE for CUSPARSE\n"); + } else if (!is_idx_type_supported) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer (32 bits or 64 " + "bits).\n"); + } else { + typename KernelHandle::SPTRSVcuSparseHandleType* h = + sptrsv_handle->get_cuSparseHandle(); + + const scalar_type alpha = scalar_type(1.0); + + cudaDataType cudaValueType = cuda_data_type_from(); + + // Create dense vector B (RHS) + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnVec(&(h->vecBDescr), static_cast(nrows), + (void*)rhs.data(), cudaValueType)); + + // Create dense vector X (LHS) + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnVec(&(h->vecXDescr), static_cast(nrows), + (void*)lhs.data(), cudaValueType)); + + // Solve + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve( + h->handle, h->transpose, &alpha, h->matDescr, h->vecBDescr, + h->vecXDescr, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, h->spsvDescr)); + + // Destroy dense vector descriptors + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecBDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h->vecXDescr)); + } +#else // CUDA_VERSION < 11030 typedef typename KernelHandle::nnz_lno_t idx_type; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::scalar_t scalar_type; @@ -253,7 +417,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; - } else if (std::is_same>::value) { + } else if (std::is_same >::value) { cuDoubleComplex cualpha; cualpha.x = 1.0; cualpha.y = 0.0; @@ -264,7 +428,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; - } else if (std::is_same>::value) { + } else if (std::is_same >::value) { cuComplex cualpha; cualpha.x = 1.0; cualpha.y = 0.0; @@ -283,6 +447,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); } +#endif #else (void)sptrsv_handle; (void)nrows;