From 2f069c4fc64abea02ea1c24451f9752ae084c50f Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 2 Feb 2023 16:13:48 -0700 Subject: [PATCH 01/25] Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES (#1667) * Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES The cmake options KokkosKernels_ENABLE_PERFTESTS and KokkosKernels_ENABLE_EXAMPLES were not actually used, both perf_test/ and example/ were always built as long as KokkosKernels_ENABLE_ALL_COMPONENTS=ON. This makes these options have an effect again. If perftests or examples are enabled but ENABLE_ALL_COMPONENTS=OFF, print a message about why they can't actually be enabled. * From e10harvey: fix typo in perf_test cmake * Add feedback about cmake - Turn ENABLE_PERFTESTS off by default - since both examples and perf tests are off by default, warn if those are ON but can't be enabled because ENABLE_ALL_COMPONENTS=OFF - use ELSE to simplify logic where ENABLE_ALL_COMPONENTS=OFF (cherry picked from commit 834a85ecec46355915031a296725635db3599ba6) --- CMakeLists.txt | 23 +++++++++++++++++++---- perf_test/CMakeLists.txt | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8166d846fe..b0df75bae6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,7 +70,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) ) KOKKOSKERNELS_ADD_OPTION( "ENABLE_PERFTESTS" - ON + OFF BOOL "Whether to build performance tests. Default: OFF" ) @@ -391,9 +391,24 @@ ELSE() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() IF (KokkosKernels_ENABLE_ALL_COMPONENTS) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) - KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) - ENDIF() + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(STATUS "Enabling perf tests.") + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(STATUS "Enabling examples.") + KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) + ENDIF () + ELSE () + # ENABLE_ALL_COMPONENTS is OFF, so perftests and examples can't be enabled. + # Warn if they were requested. + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(WARNING "Could not enable perf tests because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(WARNING "Could not enable examples because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + ENDIF () + ENDIF () KOKKOSKERNELS_PACKAGE_POSTPROCESS() IF (KokkosKernels_ENABLE_DOCS) diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index bc638b64c1..28752e9c6c 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -28,7 +28,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) - if(Kokkos_ENABLE_TESTS_AND_PERFSUITE) + if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) #Add RPS implementations of KK perf tests here KOKKOSKERNELS_ADD_EXECUTABLE( tracked_testing From b209a157c5a3043563d83ed7e848c4d48d7e9bc5 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 28 Feb 2023 12:50:50 -0700 Subject: [PATCH 02/25] Merge pull request #1691 from cwpearson/fix/cmake-force Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable (cherry picked from commit 76968d3f7dd408b25ae0a008da5ea35e0602829c) --- CMakeLists.txt | 8 ++++---- cmake/kokkoskernels_components.cmake | 27 +++++++++++++++------------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0df75bae6..5a70794076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -390,7 +390,7 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() - IF (KokkosKernels_ENABLE_ALL_COMPONENTS) + IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) @@ -400,13 +400,13 @@ ELSE() KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ENDIF () ELSE () - # ENABLE_ALL_COMPONENTS is OFF, so perftests and examples can't be enabled. + # all components were not enabled, so perftests and examples can't be enabled. # Warn if they were requested. IF (KokkosKernels_ENABLE_PERFTESTS) - MESSAGE(WARNING "Could not enable perf tests because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + MESSAGE(WARNING "Could not enable perf tests because not all components were enabled") ENDIF () IF (KokkosKernels_ENABLE_EXAMPLES) - MESSAGE(WARNING "Could not enable examples because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + MESSAGE(WARNING "Could not enable examples because not all components were enabled") ENDIF () ENDIF () diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index f33a62b6ff..56ab1a7c31 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -45,14 +45,6 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the graph component. Default: OFF" ) -# The user requested individual components, -# the assumption is that a full build is not -# desired and ENABLE_ALL_COMPONENETS is turned -# off. -IF (KokkosKernels_ENABLE_COMPONENT_BATCHED OR KokkosKernels_ENABLE_COMPONENT_BLAS - OR KokkosKernels_ENABLE_COMPONENT_GRAPH OR KokkosKernels_ENABLE_COMPONENT_SPARSE) - SET(KokkosKernels_ENABLE_ALL_COMPONENTS OFF CACHE BOOL "" FORCE) -ENDIF() # Graph depends on everything else because it depends # on Sparse at the moment, breaking that dependency will @@ -72,13 +64,24 @@ IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() -# At this point, if ENABLE_ALL_COMPONENTS is -# still ON we need to enable all individual -# components as they are required for this -# build. +# If user requested to enable all components, enable all components IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() + +# KOKKOSKERNELS_ALL_COMPONENTS_ENABLED says whether all components are on, +# regardless of how this came to be +# this is in the cache so we can use it as a global variable, +# but marking it as advanced should hide it from GUIs +IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED + AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_GRAPH + AND KokkosKernels_ENABLE_COMPONENT_SPARSE) + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED ON CACHE BOOL "" FORCE) +ELSE() + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) +ENDIF() +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file From 3831a680ab3540071aee5e1b2c50f35317761360 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 28 Feb 2023 12:50:58 -0700 Subject: [PATCH 03/25] Merge pull request #1707 from lucbv/kk_config_version Kokkos Kernels version: need to use upper case variables (cherry picked from commit d63de38b5a2db5f1febbea51f1b74b3082c32a2f) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a70794076..6eb2725ad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,7 @@ ELSE() # ================================================================== MESSAGE("") MESSAGE("================================") - MESSAGE("Kokkos Kernels version: ${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MESSAGE("Kokkos Kernels version: ${KOKKOSKERNELS_VERSION_MAJOR}.${KOKKOSKERNELS_VERSION_MINOR}.${KOKKOSKERNELS_VERSION_PATCH}") MESSAGE("================================") MESSAGE("Kokkos Kernels ETI Types") MESSAGE(" Devices: ${DEVICE_LIST}") From 1f266de0d3360439b66112f770262e4fc09069a5 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 3 Mar 2023 16:57:51 -0700 Subject: [PATCH 04/25] Merge pull request #1698 from cwpearson/fix/kk-1692 CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 (cherry picked from commit 4f39a18ecc97733c639c685aacf7dc9eb6d5cb23) --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 28a80ec266..717c62b985 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -151,7 +151,8 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, cusparseOperation_t opB = xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; -#if CUDA_VERSION < 12000 +// CUSPARSE_MM_ALG_DEFAULT was deprecated as early as 11.1 (maybe earlier) +#if CUSPARSE_VERSION < 11010 const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; #else const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; From 6cfc547ce84d611311a593b57bb9512f51d86ef7 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Thu, 2 Mar 2023 12:50:43 -0700 Subject: [PATCH 05/25] Merge pull request #1704 from e10harvey/doc_typos blas/blas1: Fix a couple documentation typos. (cherry picked from commit 3a206435045f44cbb7ca22c47d7083c27e79a3ff) --- blas/src/KokkosBlas1_mult.hpp | 14 ++++++++++++++ blas/src/KokkosBlas1_nrm2w.hpp | 2 +- blas/src/KokkosBlas1_reciprocal.hpp | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 39ccbbeebd..e08409e9aa 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -23,6 +23,20 @@ namespace KokkosBlas { +/// \brief Element wise multiplication of two vectors: +/// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] +/// +/// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. +/// \tparam AV Type of the second vector A; a 1-D Kokkos::View. +/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. +/// +/// \param gamma [in] The scalar to apply to Y. +/// \param Y [in/out] The Y vector. +/// \param alpha [in] The scalar to apply to A. +/// \param A [in] The vector to apply to X. +/// \param X [in] The X vector. +/// +/// \return Y = gamma * Y + alpha * A * X. template void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index 5a7a07001c..403d8ba685 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -65,7 +65,7 @@ nrm2w(const XVector& x, const XVector& w) { /// \brief R(i,j) = nrm2w(X(i,j)) /// -/// Replace each entry in R with the nrm2wolute value (magnitude) of the +/// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 7e171cb6df..19624d11c9 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -25,8 +25,8 @@ namespace KokkosBlas { /// \brief R(i,j) = reciprocal(X(i,j)) /// -/// Replace each entry in R with the reciprocalolute value (magnitude) of the -/// corresponding entry in X. +/// Replace each entry in R with the absolute value (magnitude), of the +/// reciprocal of the corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have From db917b2f412bec35485bc0181e3fedd0e6331258 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 14 Mar 2023 11:40:42 -0600 Subject: [PATCH 06/25] Merge pull request #1727 from lucbv/cuda_11_4_fixes CUDA 11.4: fixing some failing build while trying to reproduce issue #1725 (cherry picked from commit 26332eda6d2058f7bf2a5facf6f0ceac0a01eea5) --- sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 ++++ sparse/src/KokkosSparse_sptrsv_handle.hpp | 2 +- sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7090c4f948..17611c3f2c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -299,6 +299,10 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; + (void)row_map; + (void)entries; + (void)values; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 049ab14b1e..7c9027d24a 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -138,7 +138,7 @@ class SPTRSVHandle { cusparseSpSVDescr_t spsvDescr; void *pBuffer{nullptr}; - cuSparseHandleType(bool transpose_, bool is_lower) { + cuSparseHandleType(bool transpose_, bool /*is_lower*/) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle)); KOKKOS_CUSPARSE_SAFE_CALL( diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 50555d9815..3096278c21 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -45,7 +45,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, From 8e90d005f183d11771c4a1a62af978c05f03493b Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Sat, 18 Mar 2023 11:49:20 -0600 Subject: [PATCH 07/25] Remove unused variable (#1734) (cherry picked from commit 60881471b1cabd824ecf54675157fb9f74c97301) --- sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 3096278c21..5f555f926e 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -519,7 +519,6 @@ void spgemm_numeric_mkl( const_cast(colidxB.data()), const_cast(valuesB.data())); auto mklSpgemmHandle = handle->get_mkl_spgemm_handle(); - bool computedEntries = false; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; generalDescr.mode = SPARSE_FILL_MODE_FULL; From 99654d8cf338a918168774205bcfc6d5d2c4488f Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Tue, 21 Mar 2023 07:16:24 -0600 Subject: [PATCH 08/25] Merge pull request #1737 from e10harvey/reduce_test_coverage Reduce BatchedGemm test coverage (cherry picked from commit aec946c285f415d9cc6cedfbfe509705057e5c41) --- .../unit_test/Test_Batched_BatchedGemm.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 746be07eb7..c4e09d6e68 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -257,11 +257,15 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), algo_type); - if (algo_type == BaseHeuristicAlgos::SQUARE || - algo_type == BaseTplAlgos::ARMPL || + if (algo_type == BaseTplAlgos::ARMPL || algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, + matCdim1, matCdim2, 1.5, 3.0); + } else if (algo_type == BaseHeuristicAlgos::SQUARE) { // Invoke 4 times to ensure we cover all paths for alpha and beta impl_test_batched_gemm_with_handle( @@ -316,13 +320,12 @@ template void test_batched_gemm_with_layout(int N) { // Square cases - for (int i = 0; i < 5; ++i) { + { + int i = 0; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); - } - { - int i = 10; + i = 10; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); @@ -336,7 +339,7 @@ void test_batched_gemm_with_layout(int N) { } // Non-square cases - for (int i = 0; i < 5; ++i) { + for (int i = 1; i < 5; ++i) { int dimM = 1 * i; int dimN = 2 * i; int dimK = 3 * i; From 48ca11b50cdd68ba86a679513795274a1a90b97b Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Sat, 18 Mar 2023 11:49:38 -0600 Subject: [PATCH 09/25] Fix kk_generate_diagonally_dominant_sparse_matrix hang (#1689) * Fix kk_generate_diagonally_dominant_sparse_matrix hang Use bandwidth to cap the max entries per row, so that the row-filling loop doesn't run forever looking for a column that isn't already present. * Diag-dominant matrix generator: error if bandwidth too small If bandwidth is too small for the requested nnz and row_size_variance, error out with a detailed message. (cherry picked from commit 664bfc4d3e2144feaa279eb7cf66bb6b5adaee05) --- sparse/src/KokkosSparse_IOUtils.hpp | 41 ++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index d957cf4949..77934b4f3e 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -117,16 +117,42 @@ void kk_diagonally_dominant_sparseMatrix_generate( OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd, ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { - rowPtr = new SizeType[nrows + 1]; - + rowPtr = new SizeType[nrows + 1]; OrdinalType elements_per_row = nnz / nrows; + // Set a hard limit to the actual entries in any one row, so that the + // loop to find a column not already taken will terminate quickly. + OrdinalType max_elements_per_row = 0.7 * bandwidth; + OrdinalType requested_max_elements_per_row = + elements_per_row + 0.5 * row_size_variance; + if (requested_max_elements_per_row > max_elements_per_row) { + std::cerr + << "kk_diagonally_dominant_sparseMatrix_generate: given the bandwidth (" + << bandwidth << "),\n"; + std::cerr << " can insert a maximum of " << max_elements_per_row + << " entries per row (0.7*bandwidth).\n"; + std::cerr << " But given the requested average entries per row of " + << elements_per_row << " and variance of " << row_size_variance + << ",\n"; + std::cerr << " there should be up to " << requested_max_elements_per_row + << " entries per row.\n"; + std::cerr << " Increase the bandwidth, or decrease nnz and/or " + "row_size_variance.\n"; + throw std::invalid_argument( + "kk_diagonally_dominant_sparseMatrix_generate: requested too many " + "entries per row for the given bandwidth."); + } srand(13721); rowPtr[0] = 0; for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - if (varianz < 1) varianz = 1; - if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; - rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; + // variance is how many more (or less) entries this row has compared to the + // mean (elements_per_row). + OrdinalType variance = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + OrdinalType entries_in_row = elements_per_row + variance; + // Always have at least one entry (for the diagonal) + if (entries_in_row < 1) entries_in_row = 1; + if (entries_in_row > max_elements_per_row) + entries_in_row = max_elements_per_row; + rowPtr[row + 1] = rowPtr[row] + entries_in_row; if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row } @@ -141,6 +167,9 @@ void kk_diagonally_dominant_sparseMatrix_generate( for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { while (true) { OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + // When bandwidth would extend past the columns of the matrix, wrap + // the entry around to the other side. This means the final matrix can + // actually have structural bandwidth close to ncols. while (pos < 0) pos += ncols; while (pos >= ncols) pos -= ncols; From 22960845732c9ce49baadf8e333bf0f19d5c3596 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 28 Mar 2023 11:18:03 -0600 Subject: [PATCH 10/25] Patch Trilinos #11663 This was intended to be a temporary patch, but it will need to stay until 4.1. This means it has to be included in 4.0.1. --- sparse/src/KokkosSparse_spgemm_symbolic.hpp | 27 +++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp index ff37199699..486d999e41 100644 --- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp +++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp @@ -140,15 +140,26 @@ void spgemm_symbolic(KernelHandle *handle, // Verify that graphs A and B are sorted. // This test is designed to be as efficient as possible, but still skip // it in a release build. + // + // Temporary fix for Trilinos issue #11655: Only perform this check if a TPL + // is to be called. The KokkosKernels (non-TPL) implementation does not + // actually require sorted indices yet. And Tpetra uses size_type = size_t, so + // it will (currently) not be calling a TPL path. #ifndef NDEBUG - if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); - if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if constexpr (KokkosSparse::Impl::spgemm_symbolic_tpl_spec_avail< + const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_, Internal_blno_row_view_t_, + Internal_blno_nnz_view_t_, + Internal_clno_row_view_t_>::value) { + if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " + "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " + "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + } #endif auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); From 8928788a4529d6d61faaa3294bef13349c57cc18 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 28 Mar 2023 10:48:23 -0600 Subject: [PATCH 11/25] Update version to 4.0.01 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6eb2725ad8..208049b3b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 0) -SET(KokkosKernels_VERSION_PATCH 0) +SET(KokkosKernels_VERSION_PATCH 01) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file From cfd5928e2f53a09da810fa1e3e57986d1e71d366 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 28 Mar 2023 10:57:15 -0600 Subject: [PATCH 12/25] Update changelog --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76b4117118..e0e9e78467 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Change Log +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-28-03) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) + +### Bug Fixes: +- Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES [\#1667](https://github.com/kokkos/kokkos-kernels/pull/1667) +- Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable [\#1691](https://github.com/kokkos/kokkos-kernels/pull/1691) +- Kokkos Kernels version: need to use upper case variables [\#1707](https://github.com/kokkos/kokkos-kernels/pull/1707) +- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) +- CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 [\#1698](https://github.com/kokkos/kokkos-kernels/pull/1698) +- blas1: Fix a couple documentation typos [\#1704](https://github.com/kokkos/kokkos-kernels/pull/1704) +- CUDA 11.4: fixing some -Werror [\#1727](https://github.com/kokkos/kokkos-kernels/pull/1727) +- Remove unused variable in KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp [\#1734](https://github.com/kokkos/kokkos-kernels/pull/1734) +- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) +- Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) +- Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) +- Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) + + ## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) From 98c72b5f49f8d7b0b3737c0f8191ed43aaaf3fa5 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 31 Mar 2023 15:20:06 -0600 Subject: [PATCH 13/25] Merge pull request #1759 from tmranse/tmranse/mdfInterface MDF: Minor changes to interface for ifpack2 impl (cherry picked from commit 30bd681ffb969cc1d905468269816ae054614240) --- .gitignore | 5 +++++ cm_generate_makefile.bash | 12 ++++++++---- sparse/src/KokkosSparse_mdf.hpp | 4 ++-- sparse/src/KokkosSparse_mdf_handle.hpp | 3 ++- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index fa032cb2cb..d64726e92e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ .project *.o TAGS + +#Clangd indexing +compile_commands.json +.cache/ +.vscode/ \ No newline at end of file diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index b98abbbfc8..f66253a5f6 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -366,8 +366,7 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--enable-perfsuite: build Kokkos Kernels performance tests with -RAJAPerf Suite" + echo "--export-compile-commands: export cmake compile_commands.json file" } @@ -381,6 +380,8 @@ KOKKOSKERNELS_DO_PERFTESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +CMAKE_EXPORT_COMPILE_COMMANDS=OFF + #Build static libraries by default BUILD_SHARED_LIBRARIES=OFF @@ -539,6 +540,9 @@ do # This is the default KOKKOSKERNELS_DO_TESTS=ON ;; + --export-compile-commands) + CMAKE_EXPORT_COMPILE_COMMANDS=ON + ;; --enable-perfsuite) KOKKOSKERNELS_DO_PERFSUITE=ON ;; @@ -812,6 +816,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 90fa3beeef..672da5b4de 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -63,7 +63,7 @@ void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { } // mdf_symbolic template -void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 6f6f2658be..03fd660b95 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -62,7 +62,7 @@ struct MDF_handle { crs_matrix_type L, U; - MDF_handle(const crs_matrix_type A) + MDF_handle(const crs_matrix_type& A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), permutation_inv(col_ind_type("inverse row permutation", A.numRows())), @@ -89,6 +89,7 @@ struct MDF_handle { } col_ind_type get_permutation() { return permutation; } + col_ind_type get_permutation_inv() { return permutation_inv; } void sort_factors() { KokkosSparse::sort_crs_matrix(L); From 2214957055dea81762bcfe9ac8c1d0b46aa0b5e6 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 31 Mar 2023 20:02:30 -0600 Subject: [PATCH 14/25] Merge pull request #1763 from lucbv/roc_tpls_upgrade Roc tpls upgrade (cherry picked from commit e35ed210bded9c2ee7f9b9a8139d7c6bc9516d5c) --- cmake/Dependencies.cmake | 2 +- cmake/Modules/FindTPLROCBLAS.cmake | 48 ++++++---------------- cmake/Modules/FindTPLROCSPARSE.cmake | 44 ++++---------------- sparse/unit_test/Test_Sparse_rocsparse.hpp | 1 + 4 files changed, 22 insertions(+), 73 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4ce5a98dc0..d3b393ddde 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 0217e8cf2c..c0a9de3b50 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,37 +1,13 @@ -IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -ELSEIF (ROCBLAS_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCBLAS_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${ROCBLAS_ROOT}/include - ) +# MPL: 12/29/2022: CMake regular way to find a package +FIND_PACKAGE(ROCBLAS) +if(TARGET roc::rocblas) +## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention +## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) + SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) + SET(TPL_IMPORTED_NAME roc::rocblas) +## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention +## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) + ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) ELSE() - MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") - # Todo: figure out how to use the target defined during rocblas installation - # FIND_PACKAGE(ROCBLAS REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file + MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +ENDIF() diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 52a0261b48..5f985ff3a8 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,37 +1,9 @@ -IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -ELSEIF (ROCSPARSE_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCSPARSE_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${ROCSPARSE_ROOT}/include - ) +# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake +FIND_PACKAGE(ROCSPARSE) +if(TARGET roc::rocsparse) + SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) + SET(TPL_IMPORTED_NAME roc::rocsparse) + ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) ELSE() - MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at configure time.") - # Todo: figure out how to use the target defined during rocsparse installation - # FIND_PACKAGE(ROCSPARSE REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file + MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +ENDIF() diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp b/sparse/unit_test/Test_Sparse_rocsparse.hpp index f4c9e4e741..804c777daa 100644 --- a/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -66,6 +66,7 @@ void test_rocsparse_safe_call() { void test_rocsparse_singleton() { KokkosKernels::Impl::RocsparseSingleton& s = KokkosKernels::Impl::RocsparseSingleton::singleton(); + (void)s; } TEST_F(TestCategory, sparse_rocsparse_version) { test_rocsparse_version(); } From 4df81e5d080f99fd7cc19e695933fe4b65dc9f40 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 3 Apr 2023 17:41:27 -0600 Subject: [PATCH 15/25] Fix #1758 (#1762) For BLAS routines producing a complex scalar result (like zdotc), prefer to get the result via a pointer argument, rather than as a direct return value. Directly returning a std::complex from an "extern C" function is technically not allowed and Clang warns about it. (cherry picked from commit 53599f47d849166c68a8cf3db7bcfb4d9d1c1498) --- CheckHostBlasReturnComplex.cmake | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 30063b1cc3..b9528ce45a 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -20,8 +20,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) #define F77_BLAS_MANGLE${F77_BLAS_MANGLE} extern \"C\" { - std::complex F77_BLAS_MANGLE(zdotc,ZDOTC)( - const int* n, + void F77_BLAS_MANGLE(zdotc,ZDOTC)( + std::complex* result, const int* n, const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -35,13 +35,23 @@ int main() { TWO = std::complex(0.0,2.0); f[0] = ONE; f[1] = TWO; - std::complex ret - = F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC); + std::complex ret; + F77_BLAS_MANGLE(zdotc,ZDOTC)(&ret, &NUM, f, &INC, f, &INC); return (ret.real() == double(5.0) ? 0 : 1); } " ) - CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME}) +# Test whether the above program, which assumes BLAS can give back complex results +# via pointer arguments, compiles and runs correctly. +# If it does, assume that we don't need to get complex results as direct return values, +# which causes -Wreturn-type-c-linkage warnings. +CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) + +IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) + SET(VARNAME OFF) +ELSE() + SET(VARNAME ON) +ENDIF() ENDFUNCTION() From 8077e640bab02f34d597e39890b02a3a7a4ddf86 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 4 Apr 2023 13:10:48 -0600 Subject: [PATCH 16/25] Update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0e9e78467..f27c94fe06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log -## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-28-03) +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-03-04) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) ### Bug Fixes: @@ -16,6 +16,9 @@ - Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) - Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) - Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) +- MDF: Minor changes to interface for ifpack2 impl [\#1759](https://github.com/kokkos/kokkos-kernels/pull/1759) +- Rocm TPL support upgrade [\#1763](https://github.com/kokkos/kokkos-kernels/pull/1763) +- Fix BLAS cmake check for complex types [\#1762](https://github.com/kokkos/kokkos-kernels/pull/1762) ## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) From 49339eb3f8223bbf05861bb07f73bb888953247e Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 13 Feb 2023 11:04:07 -0700 Subject: [PATCH 17/25] Merge pull request #1661 from jgfouca/jgfouca/par_ilut_test Adds a better parilut test with gmres (cherry picked from commit 747bb9303cb978339f717c4c3bddfa05ce5a519c) --- blas/src/KokkosBlas3_trsm.hpp | 5 +- common/src/KokkosKernels_Error.hpp | 56 ++++++ .../KokkosSparse_par_ilut_numeric_impl.hpp | 6 + sparse/src/KokkosSparse_LUPrec.hpp | 135 +++++++++++++ sparse/src/KokkosSparse_MatrixPrec.hpp | 21 +-- sparse/src/KokkosSparse_Preconditioner.hpp | 6 +- sparse/src/KokkosSparse_par_ilut.hpp | 5 + sparse/unit_test/Test_Sparse_gmres.hpp | 11 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 178 +++++++++++++++++- 9 files changed, 391 insertions(+), 32 deletions(-) create mode 100644 sparse/src/KokkosSparse_LUPrec.hpp diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 8de879b1f3..2e8d2f4cfa 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -42,8 +42,9 @@ namespace KokkosBlas { /// "L" or "l" indicates matrix A lower part is stored, the /// other part is not referenced /// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, -/// "C" or "c" for conjugate transpose. \param diag [in] "U" or "u" indicates -/// the diagonal of A is assumed to be unit +/// "C" or "c" for conjugate transpose. +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit // "N" or "n" indicated the diagonal of A is assumed to be // non-unit /// \param alpha [in] Input coefficient used for multiplication with B diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index e4e4981973..9ebb104378 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -53,4 +53,60 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } // namespace Impl } // namespace KokkosKernels +/* + * Asserts and error checking macros/functions. + * + * KK_KERNEL** are for error checking within kokkos kernels. + * + * Any check with "assert" in the name is disabled for release builds + * + * For _MSG checks, the msg argument can contain '<<' if not a kernel check. + * + * This code is adapted from EKAT/src/ekat/ekat_assert.hpp + */ + +// Internal do not call directly +#define IMPL_THROW(condition, msg, exception_type) \ + do { \ + if (!(condition)) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition; \ + _ss_ << "\n" << msg; \ + throw exception_type(_ss_.str()); \ + } \ + } while (0) + +// SYCL cannot printf like the other backends quite yet +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("KERNEL CHECK FAILED:\n %s\n %s\n", \ + #condition, msg); \ + Kokkos::abort(""); \ + } \ + } while (0) + +#ifndef NDEBUG +#define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_ASSERT_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) +#define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) +#else +#define KK_ASSERT(condition) ((void)(0)) +#define KK_ASSERT_MSG(condition, msg) ((void)(0)) +#define KK_KERNEL_ASSERT(condition) ((void)(0)) +#define KK_KERNEL_ASSERT_MSG(condition, msg) ((void)(0)) +#endif + +#define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) + +#define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) + +#define KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) +#define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) + #endif // KOKKOSKERNELS_ERROR_HPP diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 481742aa83..aa8af73d69 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -329,11 +329,17 @@ struct IlutWrap { const auto out_val = lpu_col == col_idx ? lpu_val : r_val / diag; // store output entries if (row_idx >= col_idx) { + KK_KERNEL_ASSERT_MSG( + l_new_nnz < L_new_row_map(row_idx + 1), + "add_candidates: Overflowed L_new, is your A matrix sorted?"); L_new_entries(l_new_nnz) = col_idx; L_new_values(l_new_nnz) = row_idx == col_idx ? 1. : out_val; ++l_new_nnz; } if (row_idx <= col_idx) { + KK_KERNEL_ASSERT_MSG( + u_new_nnz < U_new_row_map(row_idx + 1), + "add_candidates: Overflowed U_new, is your A matrix sorted?"); U_new_entries(u_new_nnz) = col_idx; U_new_values(u_new_nnz) = out_val; ++u_new_nnz; diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp new file mode 100644 index 0000000000..fddb1f0d68 --- /dev/null +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ************************************************************************ +//@HEADER + +/// @file KokkosSparse_LUPrec.hpp + +#ifndef KK_LU_PREC_HPP +#define KK_LU_PREC_HPP + +#include +#include +#include +#include + +namespace KokkosSparse { +namespace Experimental { + +/// \class LUPrec +/// \brief This class is for applying LU preconditioning. +/// It takes L and U and the apply method returns U^inv L^inv x +/// \tparam CRS the CRS type of L and U +/// +/// Preconditioner provides the following methods +/// - initialize() Does nothing; members initialized upon object construction. +/// - isInitialized() returns true +/// - compute() Does nothing; members initialized upon object construction. +/// - isComputed() returns true +/// +template +class LUPrec : public KokkosSparse::Experimental::Preconditioner { + public: + using ScalarType = typename std::remove_const::type; + using EXSP = typename CRS::execution_space; + using karith = typename Kokkos::ArithTraits; + using View1d = typename Kokkos::View; + + private: + // trsm takes host views + CRS _L, _U; + View1d _tmp, _tmp2; + mutable KernelHandle _khL; + mutable KernelHandle _khU; + + public: + //! Constructor: + template + LUPrec(const CRSArg &L, const CRSArg &U) + : _L(L), + _U(U), + _tmp("LUPrec::_tmp", L.numRows()), + _tmp2("LUPrec::_tmp", L.numRows()), + _khL(), + _khU() { + KK_REQUIRE_MSG(L.numRows() == U.numRows(), + "LUPrec: L.numRows() != U.numRows()"); + + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), + true); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), + false); + } + + //! Destructor. + virtual ~LUPrec() { + _khL.destroy_sptrsv_handle(); + _khU.destroy_sptrsv_handle(); + } + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used + ///// \param beta [in] Not used. + ///// + ///// It takes L and U and the stores U^inv L^inv X in Y + // + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); + sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); + + sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, + _tmp2); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); + } + //@} + + //! Set this preconditioner's parameters. + void setParameters() {} + + void initialize() {} + + //! True if the preconditioner has been successfully initialized, else false. + bool isInitialized() const { return true; } + + void compute() {} + + //! True if the preconditioner has been successfully computed, else false. + bool isComputed() const { return true; } + + //! True if the preconditioner implements a transpose operator apply. + bool hasTransposeApply() const { return true; } +}; + +} // namespace Experimental +} // End namespace KokkosSparse + +#endif diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 952036b7c7..3ca9e020c7 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -32,13 +32,9 @@ namespace Experimental { /// already has a matrix representation of their /// preconditioner M. The class applies an /// SpMV with M as the preconditioning step. -/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS the type of compressed matrix /// -/// Preconditioner provides the following methods +/// MatrixPrec provides the following methods /// - initialize() Does nothing; Matrix initialized upon object construction. /// - isInitialized() returns true /// - compute() Does nothing; Matrix initialized upon object construction. @@ -47,10 +43,7 @@ namespace Experimental { template class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { private: - CRS A; - - bool isInitialized_ = true; - bool isComputed_ = true; + CRS _A; public: using ScalarType = typename std::remove_const::type; @@ -59,7 +52,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { //! Constructor: template - MatrixPrec(const CRSArg &mat) : A(mat) {} + MatrixPrec(const CRSArg &mat) : _A(mat) {} //! Destructor. virtual ~MatrixPrec() {} @@ -85,7 +78,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - KokkosSparse::spmv(transM, alpha, A, X, beta, Y); + KokkosSparse::spmv(transM, alpha, _A, X, beta, Y); } //@} @@ -95,12 +88,12 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { void initialize() {} //! True if the preconditioner has been successfully initialized, else false. - bool isInitialized() const { return isInitialized_; } + bool isInitialized() const { return true; } void compute() {} //! True if the preconditioner has been successfully computed, else false. - bool isComputed() const { return isComputed_; } + bool isComputed() const { return true; } //! True if the preconditioner implements a transpose operator apply. bool hasTransposeApply() const { return true; } diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 1c32d5eeaa..9d62a960e9 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -27,11 +27,7 @@ namespace Experimental { /// \class Preconditioner /// \brief Interface for KokkosKernels preconditioners -/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS Type of the compressed matrix /// /// Preconditioner provides the following methods /// - initialize() performs all operations based on the graph of the diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index ca46976366..21371792c0 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -348,6 +348,11 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(L_rowmap, L_entries), + "L is not sorted"); + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(U_rowmap, U_entries), + "U is not sorted"); + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; diff --git a/sparse/unit_test/Test_Sparse_gmres.hpp b/sparse/unit_test/Test_Sparse_gmres.hpp index 6b1265dabd..1990087526 100644 --- a/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/sparse/unit_test/Test_Sparse_gmres.hpp @@ -86,14 +86,13 @@ void run_test_gmres() { ViewVectorType Wj("Wj", n); // For checking residuals at end. ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); gmres_handle->set_verbose(verbose); // Test CGS2 { - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - gmres(&kh, A, B, X); // Double check residuals at end of solve: @@ -137,12 +136,12 @@ void run_test_gmres() { gmres_handle->set_verbose(verbose); // Make precond - auto myPrec = new KokkosSparse::Experimental::MatrixPrec(A); + KokkosSparse::Experimental::MatrixPrec myPrec(A); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); - gmres(&kh, A, B, X, myPrec); + gmres(&kh, A, B, X, &myPrec); // Double check residuals at end of solve: float_t nrmB = KokkosBlas::nrm2(B); @@ -154,8 +153,6 @@ void run_test_gmres() { EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - - delete myPrec; } } diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 0f73b80f3f..377d8127ec 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -25,6 +25,9 @@ #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_par_ilut.hpp" +#include "KokkosSparse_gmres.hpp" +#include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_SortCrs.hpp" #include @@ -35,6 +38,20 @@ using namespace KokkosKernels::Experimental; namespace Test { +namespace ParIlut { + +template +struct TolMeta { + static constexpr T value = 1e-8; +}; + +template <> +struct TolMeta { + static constexpr float value = 1e-5; // Lower tolerance for floats +}; + +} // namespace ParIlut + template std::vector> decompress_matrix( @@ -272,6 +289,148 @@ void run_test_par_ilut() { #endif } +template +void run_test_par_ilut_precond() { + // Test using par_ilut as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using sp_matrix_type = + KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; + using float_t = typename Kokkos::ArithTraits::mag_type; + + // Create a diagonally dominant sparse matrix to test: + constexpr auto n = 5000; + constexpr auto m = 15; + constexpr auto tol = ParIlut::TolMeta::value; + constexpr auto numRows = n; + constexpr auto numCols = n; + constexpr auto diagDominance = 1; + constexpr bool verbose = false; + + typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; + auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), + diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); + using GMRESHandle = + typename std::remove_reference::type; + using ViewVectorType = typename GMRESHandle::nnz_value_view_t; + + kh.create_par_ilut_handle(numRows); + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto row_map = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", numRows + 1); + RowMapType U_row_map("U_row_map", numRows + 1); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = par_ilut_handle->get_nnzU(); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, +#ifdef KOKKOS_ENABLE_SERIAL + true /*deterministic*/ +#else + false /*cannot ask for determinism*/ +#endif + ); + + // Create CRSs + sp_matrix_type L("L", numRows, numCols, L_values.extent(0), L_values, + L_row_map, L_entries), + U("U", numRows, numCols, U_values.extent(0), U_values, U_row_map, + U_entries); + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } + + // Solve Ax = b with LU preconditioner. Currently only works + // when deterministic mode in par_ilut is on, which is only + // possible when Kokkos::Serial has been enabled. +#ifdef KOKKOS_ENABLE_SERIAL + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond + KokkosSparse::Experimental::LUPrec myPrec(L, + U); + + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X, &myPrec); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + } +#else + EXPECT_EQ(num_iters_precond, 0); +#endif +} + } // namespace Test template (); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ +template +void test_par_ilut_precond() { + Test::run_test_par_ilut_precond(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX From 957298552bcfcf7c57ed5ffa9d051ad9180052cf Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Mar 2023 10:36:04 -0700 Subject: [PATCH 18/25] GMRES: fixing some type issues related to memory space instantiation Basically one wants to be very careful about only instantiating View or other object with an execution space only as it might generate a memory type mismatch down the road (cherry picked from commit 1ae5b7ddca97342733eb18284220dc58fa3b5c1a) Conflicts: sparse/src/KokkosSparse_MatrixPrec.hpp Resolved conflict with variable naming "A" vs "_A" in spmv call --- example/gmres/test_prec.cpp | 6 ++++-- sparse/src/KokkosSparse_MatrixPrec.hpp | 11 ++++++----- sparse/src/KokkosSparse_Preconditioner.hpp | 11 ++++++----- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index f4aca0d6f6..8d1ff74b87 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -27,9 +27,11 @@ int main(int argc, char* argv[]) { using OT = int; using EXSP = Kokkos::DefaultExecutionSpace; using MESP = typename EXSP::memory_space; - using CRS = KokkosSparse::CrsMatrix; + using CRS = + KokkosSparse::CrsMatrix, void, OT>; - using ViewVectorType = Kokkos::View; + using ViewVectorType = + Kokkos::View>; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 3ca9e020c7..3cef1b6315 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -48,6 +48,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: @@ -73,11 +74,11 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { KokkosSparse::spmv(transM, alpha, _A, X, beta, Y); } //@} diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 9d62a960e9..8fdd9398b2 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -52,6 +52,7 @@ class Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: @@ -76,11 +77,11 @@ class Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const = 0; + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const = 0; //@} //! Set this preconditioner's parameters. From 8233f7330988e27da5416fff899429107cc72fae Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 20 Mar 2023 12:12:35 -0600 Subject: [PATCH 19/25] ParIlut: create and destroy spgemm handle for each usage (#1736) * ParIlut: create and destroy spgemm handle for each usage This fixes memory errors on Cuda * Formatting --- .../impl/KokkosSparse_par_ilut_numeric_impl.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index aa8af73d69..cedc2dbd43 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -78,6 +78,11 @@ struct IlutWrap { const URowMapType& U_row_map, const UEntriesType& U_entries, const UValuesType& U_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + std::string myalg("SPGEMM_KK_MEMORY"); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::StringToSPGEMMAlgorithm(myalg); + kh.create_spgemm_handle(spgemm_algorithm); + const size_type nrows = ih.get_nrows(); KokkosSparse::Experimental::spgemm_symbolic( @@ -95,6 +100,8 @@ struct IlutWrap { // Need to sort LU CRS if on CUDA! sort_crs_matrix(LU_row_map, LU_entries, LU_values); + + kh.destroy_spgemm_handle(); } /** @@ -722,6 +729,8 @@ struct IlutWrap { RRowMapType& R_row_map, REntriesType& R_entries, RValuesType& R_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + scalar_t result; + multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); @@ -737,8 +746,6 @@ struct IlutWrap { &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, LU_values, -1., R_row_map, R_entries, R_values); - scalar_t result; - auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( @@ -858,11 +865,6 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); - std::string myalg("SPGEMM_KK_MEMORY"); - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = - KokkosSparse::StringToSPGEMMAlgorithm(myalg); - kh.create_spgemm_handle(spgemm_algorithm); - kh.create_spadd_handle(true /*we expect inputs to be sorted*/); // @@ -975,7 +977,6 @@ struct IlutWrap { ++itr; } - kh.destroy_spgemm_handle(); kh.destroy_spadd_handle(); } // end ilut_numeric From 710a2396b0e1fc66b6c88eb0347403474589037b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 4 Apr 2023 14:07:21 -0600 Subject: [PATCH 20/25] Jgfouca/remove par ilut limitations (#1755) * Remove deterministic from par_ilut precond test Now that spgemm memory errors have been fixed, it appears to work * Add verbose mode to par_ilut * Fixes for GPU * Fix end_rel_res type to work when scalar is complex * Turn off asynch fixed point on GPU * Reorganize par_ilut handle, group conceptually similar members * Refactor par_ilut deterministic setting Change to async_update and move it to the handle. compute_l_u_factors does not need to run in a serial exespace, that is way overkill. Simply turning async_updates off should allow for deterministic results. Now that we are iterating more than once, it looks like even the hardcoded fixture test works fine with async_updates on since the multiple iterations corrects any "bad" results. * Add comment for par_ilut_precond test settings * Fix ordering warning * par_ilut: add test for nrows=0 (cherry picked from commit aa96a83ad44d66c615cc18aec650f1b2a5f307c3) --- sparse/impl/KokkosSparse_gmres_impl.hpp | 1 + .../KokkosSparse_par_ilut_numeric_impl.hpp | 218 ++++++++---------- .../KokkosSparse_par_ilut_numeric_spec.hpp | 8 +- .../KokkosSparse_par_ilut_symbolic_impl.hpp | 6 + .../KokkosSparse_par_ilut_symbolic_spec.hpp | 1 - sparse/src/KokkosKernels_Handle.hpp | 13 +- sparse/src/KokkosSparse_par_ilut.hpp | 35 ++- sparse/src/KokkosSparse_par_ilut_handle.hpp | 110 ++++++--- sparse/unit_test/Test_Sparse_par_ilut.hpp | 131 +++++++---- 9 files changed, 300 insertions(+), 223 deletions(-) diff --git a/sparse/impl/KokkosSparse_gmres_impl.hpp b/sparse/impl/KokkosSparse_gmres_impl.hpp index aa90a70757..8c7231f90c 100644 --- a/sparse/impl/KokkosSparse_gmres_impl.hpp +++ b/sparse/impl/KokkosSparse_gmres_impl.hpp @@ -92,6 +92,7 @@ struct GmresWrap { std::cout << " ortho: " << ((ortho == GmresHandle::Ortho::CGS2) ? "CGS2" : "MGS") << std::endl; + std::cout << " precond: " << (precond ? "ON" : "OFF") << std::endl; } // Make tmp work views diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index cedc2dbd43..89dcd12c5b 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -433,64 +433,12 @@ struct IlutWrap { return Kokkos::make_pair(a_val - sum, ut_nnz); } - template - KOKKOS_FUNCTION static void compute_l_u_factors_impl( - const ARowMapType& A_row_map, const AEntriesType& A_entries, - const AValuesType& A_values, LRowMapType& L_row_map, - LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, - UEntriesType& U_entries, UValuesType& U_values, UtRowMapType& Ut_row_map, - UtEntriesType& Ut_entries, UtValuesType& Ut_values, MemberType& team) { - const auto row_idx = team.league_rank(); - - const auto l_row_nnz_begin = L_row_map(row_idx); - const auto l_row_nnz_end = L_row_map(row_idx + 1); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, l_row_nnz_begin, l_row_nnz_end - 1), - [&](const size_type l_nnz) { - const auto col_idx = L_entries(l_nnz); - const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); - if (u_diag != 0.0) { - const auto new_val = - compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, Ut_row_map, - Ut_entries, Ut_values) - .first / - u_diag; - L_values(l_nnz) = new_val; - } - }); - - team.team_barrier(); - - const auto u_row_nnz_begin = U_row_map(row_idx); - const auto u_row_nnz_end = U_row_map(row_idx + 1); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, u_row_nnz_begin, u_row_nnz_end), - [&](const size_type u_nnz) { - const auto col_idx = U_entries(u_nnz); - const auto sum = compute_sum(row_idx, col_idx, A_row_map, A_entries, - A_values, L_row_map, L_entries, L_values, - Ut_row_map, Ut_entries, Ut_values); - const auto new_val = sum.first; - const auto ut_nnz = sum.second; - U_values(u_nnz) = new_val; - Ut_values(ut_nnz) = new_val; // ut_nnz is not guarateed to fail into - // range used exclusively by this team - }); - } - /** * Implements a single iteration/sweep of the fixed-point ILU algorithm. * The results of this function are non-deterministic due to concurrent - * reading and writing of Ut values. deterministic can be set to true to - * make this function determistic, but it will be run in Serial exe space - * if so. + * reading and writing of Ut values. async_update can be set to false to + * make this function determistic, but that could cause par_ilut + * to take longer (more iterations) to converge. */ template ; - using smember_type = typename spolicy_type::member_type; - - const size_type nrows = ih.get_nrows(); - spolicy_type policy(nrows, 1); - - auto A_row_map_h = Kokkos::create_mirror_view(A_row_map); - auto A_entries_h = Kokkos::create_mirror_view(A_entries); - auto A_values_h = Kokkos::create_mirror_view(A_values); - auto L_row_map_h = Kokkos::create_mirror_view(L_row_map); - auto L_entries_h = Kokkos::create_mirror_view(L_entries); - auto L_values_h = Kokkos::create_mirror_view(L_values); - auto U_row_map_h = Kokkos::create_mirror_view(U_row_map); - auto U_entries_h = Kokkos::create_mirror_view(U_entries); - auto U_values_h = Kokkos::create_mirror_view(U_values); - auto Ut_row_map_h = Kokkos::create_mirror_view(Ut_row_map); - auto Ut_entries_h = Kokkos::create_mirror_view(Ut_entries); - auto Ut_values_h = Kokkos::create_mirror_view(Ut_values); - - Kokkos::deep_copy(A_row_map_h, A_row_map); - Kokkos::deep_copy(A_entries_h, A_entries); - Kokkos::deep_copy(A_values_h, A_values); - Kokkos::deep_copy(L_row_map_h, L_row_map); - Kokkos::deep_copy(L_entries_h, L_entries); - Kokkos::deep_copy(L_values_h, L_values); - Kokkos::deep_copy(U_row_map_h, U_row_map); - Kokkos::deep_copy(U_entries_h, U_entries); - Kokkos::deep_copy(U_values_h, U_values); - Kokkos::deep_copy(Ut_row_map_h, Ut_row_map); - Kokkos::deep_copy(Ut_entries_h, Ut_entries); - Kokkos::deep_copy(Ut_values_h, Ut_values); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const smember_type& team) { - compute_l_u_factors_impl( - A_row_map_h, A_entries_h, A_values_h, L_row_map_h, L_entries_h, - L_values_h, U_row_map_h, U_entries_h, U_values_h, Ut_row_map_h, - Ut_entries_h, Ut_values_h, team); - }); + UtValuesType& Ut_values, const bool async_update) { + const size_type nrows = ih.get_nrows(); + Kokkos::parallel_for( + "compute_l_u_factors", range_policy(0, nrows), + KOKKOS_LAMBDA(const size_type row_idx) { + const auto l_row_nnz_begin = L_row_map(row_idx); + const auto l_row_nnz_end = + L_row_map(row_idx + 1) - 1; // skip diagonal for L + + for (auto l_nnz = l_row_nnz_begin; l_nnz < l_row_nnz_end; ++l_nnz) { + const auto col_idx = L_entries(l_nnz); + const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); + if (u_diag != 0.0) { + const auto new_val = + compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, Ut_row_map, + Ut_entries, Ut_values) + .first / + u_diag; + L_values(l_nnz) = new_val; + } + } - Kokkos::deep_copy(L_values, L_values_h); - Kokkos::deep_copy(U_values, U_values_h); - Kokkos::deep_copy(Ut_values, Ut_values_h); -#else - throw std::runtime_error( - "compute_l_u factors cannot be deterministic without Kokkos::Serial " - "available"); -#endif - } else { - const auto policy = ih.get_default_team_policy(); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const member_type& team) { - compute_l_u_factors_impl(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, Ut_row_map, Ut_entries, - Ut_values, team); - }); - } + const auto u_row_nnz_begin = U_row_map(row_idx); + const auto u_row_nnz_end = U_row_map(row_idx + 1); + + for (auto u_nnz = u_row_nnz_begin; u_nnz < u_row_nnz_end; ++u_nnz) { + const auto col_idx = U_entries(u_nnz); + const auto sum = compute_sum( + row_idx, col_idx, A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, Ut_row_map, Ut_entries, Ut_values); + const auto new_val = sum.first; + const auto ut_nnz = sum.second; + U_values(u_nnz) = new_val; + + // ut_nnz is not guarateed to fail into range used exclusively + // by this thread. Updating it here opens up potential race + // conditions that cause problems on GPU but usually causes + // faster convergence. + if (async_update) { + Ut_values(ut_nnz) = new_val; + } + } + }); } /** @@ -853,7 +781,8 @@ struct IlutWrap { const AValuesType& A_values, LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, UEntriesType& U_entries, - UValuesType& U_values, bool deterministic) { + UValuesType& U_values) { + // Get config settings from handle const size_type nrows = thandle.get_nrows(); const auto fill_in_limit = thandle.get_fill_in_limit(); const auto l_nnz_limit = @@ -865,6 +794,21 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); + const auto verbose = thandle.get_verbose(); + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + const auto async_update = !on_gpu && thandle.get_async_update(); + + if (verbose) { + std::cout << "Starting PARILUT with..." << std::endl; + std::cout << " num_rows: " << nrows << std::endl; + std::cout << " fill_in_limit: " << fill_in_limit << std::endl; + std::cout << " max_iter: " << max_iter << std::endl; + std::cout << " res_norm_delta_stop: " << residual_norm_delta_stop + << std::endl; + std::cout << " async_update: " << async_update << std::endl; + } + kh.create_spadd_handle(true /*we expect inputs to be sorted*/); // @@ -890,8 +834,8 @@ struct IlutWrap { auto V_copy = Kokkos::create_mirror_view(V_copy_d); size_type itr = 0; + scalar_t curr_residual = std::numeric_limits::max(); scalar_t prev_residual = std::numeric_limits::max(); - bool converged = false; // Set the initial L/U values for the initial approximation initialize_LU(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, @@ -900,7 +844,8 @@ struct IlutWrap { // // main loop // - while (!converged && itr < max_iter) { + bool stop = nrows == 0; // Don't iterate at all if nrows=0 + while (!stop && itr < max_iter) { // LU = L*U if (prev_residual == std::numeric_limits::max()) { multiply_matrices(kh, thandle, L_row_map, L_entries, L_values, @@ -923,7 +868,7 @@ struct IlutWrap { compute_l_u_factors( thandle, A_row_map, A_entries, A_values, L_new_row_map, L_new_entries, L_new_values, U_new_row_map, U_new_entries, U_new_values, - Ut_new_row_map, Ut_new_entries, Ut_new_values, deterministic); + Ut_new_row_map, Ut_new_entries, Ut_new_values, async_update); // Filter smallest elements from L_new and U_new. Store result back // in L and U. @@ -957,18 +902,28 @@ struct IlutWrap { compute_l_u_factors(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, Ut_new_row_map, Ut_new_entries, Ut_new_values, - deterministic); + async_update); - // Compute residual and terminate if converged + // Compute residual and check stop conditions { - const auto curr_residual = compute_residual_norm( + curr_residual = compute_residual_norm( kh, thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, R_row_map, R_entries, R_values, LU_row_map, LU_entries, LU_values); - if (karith::abs(prev_residual - curr_residual) <= - karith::abs(residual_norm_delta_stop)) { - converged = true; + if (verbose) { + std::cout << "Completed itr " << itr + << ", residual is: " << curr_residual << std::endl; + } + + const auto curr_delta = karith::abs(prev_residual - curr_residual); + if (curr_delta <= residual_norm_delta_stop) { + if (verbose) { + std::cout << " Itr-to-itr residual change has dropped below " + "residual_norm_delta_stop, stop" + << std::endl; + } + stop = true; } else { prev_residual = curr_residual; } @@ -977,6 +932,13 @@ struct IlutWrap { ++itr; } + curr_residual = nrows == 0 ? scalar_t(0.) : curr_residual; + if (verbose) { + std::cout << "PAR_ILUT stopped in " << itr << " iterations with residual " + << curr_residual << std::endl; + } + thandle.set_stats(itr, curr_residual); + kh.destroy_spadd_handle(); } // end ilut_numeric diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index 72130a07ad..fd3bc2b8bb 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -114,8 +114,7 @@ struct PAR_ILUT_NUMERIC { const AValuesType &A_values, LRowMapType &L_row_map, LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, - UEntriesType &U_entries, UValuesType &U_values, - bool deterministic = false); + UEntriesType &U_entries, UValuesType &U_values); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -135,15 +134,14 @@ struct PAR_ILUT_NUMERICget_par_ilut_handle(); using Ilut = Experimental::IlutWrap< typename std::remove_pointer::type>; Ilut::ilut_numeric(*handle, *par_ilut_handle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values, deterministic); + U_entries, U_values); } }; diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp index 573f550832..4e331d2f11 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp @@ -44,6 +44,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, using size_type = typename IlutHandle::size_type; using Ilut = IlutWrap; + const size_type a_nrows = A_row_map_d.extent(0); + const size_type nrows = a_nrows > 0 ? (a_nrows - 1) : 0; + thandle.set_nrows(nrows); + const auto policy = thandle.get_default_team_policy(); // Sizing for the initial L/U approximation @@ -83,8 +87,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, const size_type nnzsL = Ilut::prefix_sum(L_row_map_d); const size_type nnzsU = Ilut::prefix_sum(U_row_map_d); + // Set symbolic info on handle thandle.set_nnzL(nnzsL); thandle.set_nnzU(nnzsU); + thandle.set_symbolic_complete(); } // end ilut_symbolic diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index a27a685a00..b822d12ab0 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -108,7 +108,6 @@ struct PAR_ILUT_SYMBOLICset_symbolic_complete(); } }; #endif diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 12f3614a07..9d4c10a3d8 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -871,12 +871,17 @@ class KokkosKernelsHandle { } PAR_ILUTHandleType *get_par_ilut_handle() { return this->par_ilutHandle; } - void create_par_ilut_handle(const size_type nrows, const size_type nnzL = 0, - const size_type nnzU = 0, - const size_type max_iter = 1) { + void create_par_ilut_handle( + const size_type max_iter = 20, + const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = + 1e-2, + const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, + const bool async_update = true, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; - this->par_ilutHandle = new PAR_ILUTHandleType(nrows, nnzL, nnzU, max_iter); + this->par_ilutHandle = + new PAR_ILUTHandleType(max_iter, residual_norm_delta_stop, + fill_in_limit, async_update, verbose); this->par_ilutHandle->set_team_size(this->team_work_size); this->par_ilutHandle->set_vector_size(this->vector_size); } diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 21371792c0..c7def802b9 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -20,9 +20,13 @@ /// This file provides KokkosSparse::par_ilut. This function performs a /// local (no MPI) sparse ILU(t) on matrices stored in /// compressed row sparse ("Crs") format. It is expected that symbolic -/// is called before numeric. The numeric function offers a deterministic -/// flag that will force the function to have deterministic results. This -/// is useful for testing but incurs a big performance penalty. +/// is called before numeric. The handle offers an async_update +/// flag that controls whether asynchronous updates are allowed while computing +/// L U factors. This is useful for testing as it allows for repeatable +/// (deterministic) results but may cause the algorithm to take longer (more +/// iterations) to converge. The par_ilut algorithm will repeat (iterate) until +/// max_iters is hit or the improvement in the residual from iter to iter drops +/// below a certain threshold. /// /// This algorithm is described in the paper: /// PARILUT - A New Parallel Threshold ILU Factorization - Anzt, Chow, Dongarra @@ -114,6 +118,13 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); + if (A_rowmap.extent(0) != 0) { + KK_REQUIRE_MSG(A_rowmap.extent(0) == L_rowmap.extent(0), + "L row map size does not match A row map"); + KK_REQUIRE_MSG(A_rowmap.extent(0) == U_rowmap.extent(0), + "U row map size does not match A row map"); + } + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; @@ -173,8 +184,7 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, - UEntriesType& U_entries, UValuesType& U_values, - bool deterministic) { + UEntriesType& U_entries, UValuesType& U_values) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using scalar_type = typename KernelHandle::nnz_scalar_t; @@ -437,11 +447,16 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosSparse::Impl::PAR_ILUT_NUMERIC< const_handle_type, ARowMap_Internal, AEntries_Internal, AValues_Internal, LRowMap_Internal, LEntries_Internal, LValues_Internal, URowMap_Internal, - UEntries_Internal, - UValues_Internal>::par_ilut_numeric(&tmp_handle, A_rowmap_i, A_entries_i, - A_values_i, L_rowmap_i, L_entries_i, - L_values_i, U_rowmap_i, U_entries_i, - U_values_i, deterministic); + UEntries_Internal, UValues_Internal>::par_ilut_numeric(&tmp_handle, + A_rowmap_i, + A_entries_i, + A_values_i, + L_rowmap_i, + L_entries_i, + L_values_i, + U_rowmap_i, + U_entries_i, + U_values_i); // These may have been resized L_entries = L_entries_i; diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 04f546cdd6..7ae11d297c 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -24,6 +24,12 @@ namespace KokkosSparse { namespace Experimental { +/** + * Handle for par_ilut. Contains useful types, par_ilut configuration settings, + * symbolic settings and scalar output info. + * + * For more info, see KokkosSparse_par_ilut.hpp doxygen + */ template class PAR_ILUTHandle { @@ -66,41 +72,59 @@ class PAR_ILUTHandle { typename nnz_row_view_t::memory_traits>; private: - size_type nrows; - size_type nnzL; - size_type nnzU; - size_type max_iter; - nnz_scalar_t residual_norm_delta_stop; - - bool symbolic_complete; - - int team_size; - int vector_size; - - float_t fill_in_limit; + // User inputs + size_type max_iter; /// Hard cap on the number of par_ilut iterations + float_t residual_norm_delta_stop; /// When the change in residual from + /// iteration to iteration drops below + /// this, the algorithm will stop (even if + /// max_iters has not been hit) + float_t fill_in_limit; /// The threshold for the ILU factorization + bool async_update; /// Whether compute LU factors should do asychronous + /// updates. When ON, the algorithm will usually converge + /// faster but it makes the algorithm non-deterministic. + /// This will always be OFF for GPU since it doesn't work + /// there. + bool verbose; /// Print information while executing par_ilut + + // Stored by parent KokkosKernelsHandle + int team_size; /// Kokkos team size. Set by the parent handle. -1 implies + /// AUTO + int vector_size; /// Kokkos vector size. Set by the parent handle. + + // Stored by symbolic phase + size_type + nrows; /// Number of rows in the CSRs given to the symbolic par_ilut + size_type nnzL; /// Number of non-zero entries in the L part of A in the CSRs + /// given to the symbolic par_ilut + size_type nnzU; /// Number of non-zero entries in the U part of A in the CSRs + /// given to the symbolic par_ilut + bool symbolic_complete; /// Whether symbolic par_ilut has been called + + // Outputs + int num_iters; /// The number of iterations par_ilut took to finish + nnz_scalar_t end_rel_res; /// The A - LU residual norm at the time the + /// algorithm finished public: - PAR_ILUTHandle(const size_type nrows_, const size_type nnzL_ = 0, - const size_type nnzU_ = 0, const size_type max_iter_ = 1) - : nrows(nrows_), - nnzL(nnzL_), - nnzU(nnzU_), - max_iter(max_iter_), - residual_norm_delta_stop(0.), - symbolic_complete(false), + // See KokkosKernelsHandle::create_par_ilut_handle for default user input + // values + PAR_ILUTHandle(const size_type max_iter_, + const float_t residual_norm_delta_stop_, + const float_t fill_in_limit_, const bool async_update_, + const bool verbose_) + : max_iter(max_iter_), + residual_norm_delta_stop(residual_norm_delta_stop_), + fill_in_limit(fill_in_limit_), + async_update(async_update_), + verbose(verbose_), team_size(-1), vector_size(-1), - fill_in_limit(0.75) {} - - void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_) { - set_nrows(nrows_); - set_nnzL(nnzL_); - set_nnzU(nnzU_); - set_residual_norm_delta_stop(0.); - reset_symbolic_complete(); - set_fill_in_limit(0.75); - } + nrows(0), + nnzL(0), + nnzU(0), + symbolic_complete(false), + num_iters(-1), + end_rel_res(-1) {} KOKKOS_INLINE_FUNCTION ~PAR_ILUTHandle() {} @@ -137,11 +161,10 @@ class PAR_ILUTHandle { void set_max_iter(const size_type max_iter_) { this->max_iter = max_iter_; } int get_max_iter() const { return this->max_iter; } - void set_residual_norm_delta_stop( - const nnz_scalar_t residual_norm_delta_stop_) { + void set_residual_norm_delta_stop(const float_t residual_norm_delta_stop_) { this->residual_norm_delta_stop = residual_norm_delta_stop_; } - nnz_scalar_t get_residual_norm_delta_stop() const { + float_t get_residual_norm_delta_stop() const { return this->residual_norm_delta_stop; } @@ -150,6 +173,16 @@ class PAR_ILUTHandle { } float_t get_fill_in_limit() const { return this->fill_in_limit; } + bool get_verbose() const { return verbose; } + + void set_verbose(const bool verbose_) { this->verbose = verbose_; } + + bool get_async_update() const { return async_update; } + + void set_async_update(const bool async_update_) { + this->async_update = async_update_; + } + TeamPolicy get_default_team_policy() const { if (team_size == -1) { return TeamPolicy(nrows, Kokkos::AUTO); @@ -157,6 +190,15 @@ class PAR_ILUTHandle { return TeamPolicy(nrows, team_size); } } + + int get_num_iters() const { return num_iters; } + + nnz_scalar_t get_end_rel_res() const { return end_rel_res; } + + void set_stats(int num_iters_, nnz_scalar_t end_rel_res_) { + num_iters = num_iters_; + end_rel_res = end_rel_res_; + } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 377d8127ec..9b99c1000d 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -176,9 +176,10 @@ void run_test_par_ilut() { // Make kernel handle KernelHandle kh; - kh.create_par_ilut_handle(nrows); + kh.create_par_ilut_handle(); auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_async_update(false); // Allocate L and U CRS views as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -190,8 +191,8 @@ void run_test_par_ilut() { const size_type nnzL = par_ilut_handle->get_nnzL(); const size_type nnzU = par_ilut_handle->get_nnzU(); - EXPECT_EQ(nnzL, 10); - EXPECT_EQ(nnzU, 8); + ASSERT_EQ(nnzL, 10); + ASSERT_EQ(nnzU, 8); EntriesType L_entries("L_entries", nnzL); ValuesType L_values("L_values", nnzL); @@ -199,13 +200,7 @@ void run_test_par_ilut() { ValuesType U_values("U_values", nnzU); par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask for determinism*/ -#endif - ); + L_values, U_row_map, U_entries, U_values); // Use this to check LU // std::vector > expected_LU = { @@ -259,10 +254,6 @@ void run_test_par_ilut() { // check_matrix("U numeric", U_row_map, U_entries, U_values, // expected_U_candidates); - // Serial is required for deterministic mode and the checks below cannot - // reliably pass without determinism. -#ifdef KOKKOS_ENABLE_SERIAL - // Use these fixtures to test full numeric std::vector> expected_L_candidates = { {1., 0., 0., 0.}, @@ -283,10 +274,7 @@ void run_test_par_ilut() { check_matrix("U numeric", U_row_map, U_entries, U_values, expected_U_candidates); - // Checking - kh.destroy_par_ilut_handle(); -#endif } template ::mag_type; // Create a diagonally dominant sparse matrix to test: + // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and + // async_update are all left as defaults constexpr auto n = 5000; constexpr auto m = 15; constexpr auto tol = ParIlut::TolMeta::value; @@ -330,8 +320,9 @@ void run_test_par_ilut_precond() { typename std::remove_reference::type; using ViewVectorType = typename GMRESHandle::nnz_value_view_t; - kh.create_par_ilut_handle(numRows); + kh.create_par_ilut_handle(); auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_verbose(verbose); // Pull out views from CRS auto row_map = A.graph.row_map; @@ -354,13 +345,7 @@ void run_test_par_ilut_precond() { ValuesType U_values("U_values", nnzU); par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask for determinism*/ -#endif - ); + L_values, U_row_map, U_entries, U_values); // Create CRSs sp_matrix_type L("L", numRows, numCols, L_values.extent(0), L_values, @@ -396,10 +381,7 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); } - // Solve Ax = b with LU preconditioner. Currently only works - // when deterministic mode in par_ilut is on, which is only - // possible when Kokkos::Serial has been enabled. -#ifdef KOKKOS_ENABLE_SERIAL + // Solve Ax = b with LU preconditioner. { gmres_handle->reset_handle(m, tol); gmres_handle->set_verbose(verbose); @@ -426,9 +408,65 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); EXPECT_LT(num_iters_precond, num_iters_plain); } -#else - EXPECT_EQ(num_iters_precond, 0); -#endif +} + +template +void run_test_par_ilut_zerorow_A() { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + + const size_type nrows = 0; + + // Allocate device CRS views for A + RowMapType row_map("row_map", 0); + EntriesType entries("entries", 0); + ValuesType values("values", 0); + + // Create host mirror views for CRS A + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + // Make kernel handle + KernelHandle kh; + + kh.create_par_ilut_handle(); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", nrows + 1); + RowMapType U_row_map("U_row_map", nrows + 1); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = par_ilut_handle->get_nnzU(); + + ASSERT_EQ(nnzL, 0); + ASSERT_EQ(nnzU, 0); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + + const auto itrs = par_ilut_handle->get_num_iters(); + const auto end_rel_res = par_ilut_handle->get_end_rel_res(); + + EXPECT_EQ(itrs, 0); + EXPECT_EQ(end_rel_res, scalar_t(0.)); + + kh.destroy_par_ilut_handle(); } } // namespace Test @@ -445,15 +483,26 @@ void test_par_ilut_precond() { Test::run_test_par_ilut_precond(); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut_precond(); \ +template +void test_par_ilut_zerorow_A() { + Test::run_test_par_ilut_zerorow_A(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_zerorow_A##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_zerorow_A(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX From 0dcbd6a17a6a039a9b7899c74ca7d58d48a8a30e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 14 Apr 2023 12:39:54 -0600 Subject: [PATCH 21/25] par_ilut: make Ut_values view atomic in compute_l_u_factors (#1781) * par_ilut: make Ut_values view atomic in compute_l_u_factors ... to fix the race issues when async updates are on. * With Ut atomic, no need to avoid async updates on GPU * Remove unnecessary header * Update comments * Fixes for complex scalars * Adjust async update views; default it to off * Fix UtValuesSafeType * Update sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp * Remove UtViewType in favor of std::conditional (cherry picked from commit 507c29f685920b2278c77dd45b3a9fe3f7260771) --- .../KokkosSparse_par_ilut_numeric_impl.hpp | 65 ++++++++++++++----- sparse/src/KokkosKernels_Handle.hpp | 2 +- sparse/src/KokkosSparse_par_ilut_handle.hpp | 2 - 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 89dcd12c5b..c482aff429 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -420,7 +420,8 @@ struct IlutWrap { const auto l_col = L_entries(l_row_nnz); const auto u_row = Ut_entries(ut_row_nnz); if (l_col == u_row && l_col < last_entry) { - sum += L_values(l_row_nnz) * Ut_values(ut_row_nnz); + const scalar_t ut_val = Ut_values(ut_row_nnz); + sum += L_values(l_row_nnz) * ut_val; } if (static_cast(u_row) == row_idx) { ut_nnz = ut_row_nnz; @@ -440,17 +441,32 @@ struct IlutWrap { * make this function determistic, but that could cause par_ilut * to take longer (more iterations) to converge. */ - template - static void compute_l_u_factors( + template + static void compute_l_u_factors_impl( IlutHandle& ih, const ARowMapType& A_row_map, const AEntriesType& A_entries, const AValuesType& A_values, LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, UEntriesType& U_entries, UValuesType& U_values, UtRowMapType& Ut_row_map, UtEntriesType& Ut_entries, - UtValuesType& Ut_values, const bool async_update) { + UtValuesType& Ut_values_arg) { + // UtValues needs to be Atomic if async updates are on. Otherwise, + // non-atomic is fine. + using UtValuesSafeType = std::conditional_t< + async_update, + Kokkos::View< + typename UtValuesType::non_const_value_type*, + typename UtValuesType::array_layout, + typename UtValuesType::device_type, + Kokkos::MemoryTraits >, + UtValuesType>; + + UtValuesSafeType Ut_values = Ut_values_arg; + const size_type nrows = ih.get_nrows(); Kokkos::parallel_for( "compute_l_u_factors", range_policy(0, nrows), @@ -460,8 +476,8 @@ struct IlutWrap { L_row_map(row_idx + 1) - 1; // skip diagonal for L for (auto l_nnz = l_row_nnz_begin; l_nnz < l_row_nnz_end; ++l_nnz) { - const auto col_idx = L_entries(l_nnz); - const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); + const auto col_idx = L_entries(l_nnz); + const scalar_t u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); if (u_diag != 0.0) { const auto new_val = compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, @@ -487,8 +503,7 @@ struct IlutWrap { // ut_nnz is not guarateed to fail into range used exclusively // by this thread. Updating it here opens up potential race - // conditions that cause problems on GPU but usually causes - // faster convergence. + // conditions but usually causes faster convergence. if (async_update) { Ut_values(ut_nnz) = new_val; } @@ -496,6 +511,28 @@ struct IlutWrap { }); } + template + static void compute_l_u_factors( + IlutHandle& ih, const ARowMapType& A_row_map, + const AEntriesType& A_entries, const AValuesType& A_values, + LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, + URowMapType& U_row_map, UEntriesType& U_entries, UValuesType& U_values, + UtRowMapType& Ut_row_map, UtEntriesType& Ut_entries, + UtValuesType& Ut_values, const bool async_update) { + if (async_update) { + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); + } else { + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); + } + } + /** * Select threshold based on filter rank. Do all this on host */ @@ -794,10 +831,8 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); - const auto verbose = thandle.get_verbose(); - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - const auto async_update = !on_gpu && thandle.get_async_update(); + const auto verbose = thandle.get_verbose(); + const auto async_update = false; // thandle.get_async_update(); if (verbose) { std::cout << "Starting PARILUT with..." << std::endl; diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 9d4c10a3d8..598cf9577d 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -876,7 +876,7 @@ class KokkosKernelsHandle { const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = 1e-2, const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, - const bool async_update = true, const bool verbose = false) { + const bool async_update = false, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; this->par_ilutHandle = diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 7ae11d297c..3ffe44ffca 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -82,8 +82,6 @@ class PAR_ILUTHandle { bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. - /// This will always be OFF for GPU since it doesn't work - /// there. bool verbose; /// Print information while executing par_ilut // Stored by parent KokkosKernelsHandle From 82e13ca28df8d660acb8548d90f8e4fada043850 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 20 Apr 2023 12:23:15 -0600 Subject: [PATCH 22/25] Update changelog Add and reorder parilut entries Fix broken 4.0.0 changelog url --- CHANGELOG.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f27c94fe06..ac3b708fb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,28 +1,31 @@ # Change Log -## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-03-04) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) ### Bug Fixes: - Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES [\#1667](https://github.com/kokkos/kokkos-kernels/pull/1667) - Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable [\#1691](https://github.com/kokkos/kokkos-kernels/pull/1691) - Kokkos Kernels version: need to use upper case variables [\#1707](https://github.com/kokkos/kokkos-kernels/pull/1707) -- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) - CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 [\#1698](https://github.com/kokkos/kokkos-kernels/pull/1698) - blas1: Fix a couple documentation typos [\#1704](https://github.com/kokkos/kokkos-kernels/pull/1704) - CUDA 11.4: fixing some -Werror [\#1727](https://github.com/kokkos/kokkos-kernels/pull/1727) - Remove unused variable in KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp [\#1734](https://github.com/kokkos/kokkos-kernels/pull/1734) -- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) - Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) - Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) - Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) - MDF: Minor changes to interface for ifpack2 impl [\#1759](https://github.com/kokkos/kokkos-kernels/pull/1759) - Rocm TPL support upgrade [\#1763](https://github.com/kokkos/kokkos-kernels/pull/1763) - Fix BLAS cmake check for complex types [\#1762](https://github.com/kokkos/kokkos-kernels/pull/1762) +- ParIlut: Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) +- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) +- ParIlut: remove par ilut limitations [\#1755](https://github.com/kokkos/kokkos-kernels/pull/1755) +- ParIlut: make Ut_values view atomic in compute_l_u_factors [\#1781](https://github.com/kokkos/kokkos-kernels/pull/1781) -## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) +## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.00) (2023-21-02) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.00) ### Features: - Copyright update 4.0 [\#1657](https://github.com/kokkos/kokkos-kernels/pull/1657) From a46ebd5e9a914a022688094d4bc829ffaa3be9fa Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 7 Mar 2023 11:02:44 -0700 Subject: [PATCH 23/25] Merge pull request #1719 from lucbv/gmres_type_fixes GMRES: fixing some type issues related to memory space instantiation (cherry picked from commit f41ff478c49e4c53f84128311c202709d3fa7861) --- sparse/src/KokkosSparse_LUPrec.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index fddb1f0d68..a257b8f09c 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -44,6 +44,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; using View1d = typename Kokkos::View; @@ -90,11 +91,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// ///// It takes L and U and the stores U^inv L^inv X in Y // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp From 8809e41cae6cd2e5113681f0f61f6cc11052005d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 26 Apr 2023 16:32:45 -0600 Subject: [PATCH 24/25] Update to version 4.0.1 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 208049b3b7..19c66c0f64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 0) -SET(KokkosKernels_VERSION_PATCH 01) +SET(KokkosKernels_VERSION_PATCH 1) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file From c208dacaedfbf05291ebc509ab68274c03e6f72f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 25 Apr 2023 20:37:29 -0600 Subject: [PATCH 25/25] Update master_history.txt --- master_history.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/master_history.txt b/master_history.txt index a252fcb06e..6c9f253c07 100644 --- a/master_history.txt +++ b/master_history.txt @@ -19,4 +19,5 @@ tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 -tag: 4.0.0 date: 02/32/2023 master: b4014bf2 release: a10dff20 +tag: 4.0.00 date: 02/23/2023 master: b4014bf2 release: a10dff20 +tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c