From 1cc1e9cbb1d45ce4b2535dc2c743e051c5310e65 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 25 Apr 2022 08:06:40 -0600 Subject: [PATCH 01/11] Merge pull request #1390 from vqd8a/fix-spiluk-numeric-race-condition Improve spiluk numeric phase to avoid race conditions and processing in chunks (cherry picked from commit ca33f614218b5cfc46796fda82ead414bdc4daf0) --- src/sparse/KokkosSparse_spiluk_handle.hpp | 50 ++++-- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 88 +++++++--- .../KokkosSparse_spiluk_symbolic_impl.hpp | 163 +++++++++++++++--- 3 files changed, 243 insertions(+), 58 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 522e0461d5..3cabcd0f73 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -100,12 +100,17 @@ class SPILUKHandle { nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level + nnz_lno_view_t level_nchunks; // number of chunks of rows at each level + nnz_lno_view_t + level_nrowsperchunk; // maximum number of rows among chunks at each level size_type nrows; - size_type nlevel; + size_type nlevels; size_type nnzL; size_type nnzU; - size_type level_maxrows; // maximum number of rows of levels + size_type level_maxrows; // max. number of rows among levels + size_type + level_maxrowsperchunk; // max.number of rows among chunks among levels bool symbolic_complete; @@ -121,11 +126,14 @@ class SPILUKHandle { : level_list(), level_idx(), level_ptr(), + level_nchunks(), + level_nrowsperchunk(), nrows(nrows_), - nlevel(0), + nlevels(0), nnzL(nnzL_), nnzU(nnzU_), level_maxrows(0), + level_maxrowsperchunk(0), symbolic_complete(symbolic_complete_), algm(choice), team_size(-1), @@ -138,9 +146,11 @@ class SPILUKHandle { set_nnzL(nnzL_); set_nnzU(nnzU_); set_level_maxrows(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + set_level_maxrowsperchunk(0); + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), reset_symbolic_complete(); } @@ -159,6 +169,20 @@ class SPILUKHandle { KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_level_ptr() const { return level_ptr; } + KOKKOS_INLINE_FUNCTION + nnz_lno_view_t get_level_nchunks() const { return level_nchunks; } + + void alloc_level_nchunks(const size_type nlevels_) { + level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_); + } + + KOKKOS_INLINE_FUNCTION + nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } + + void alloc_level_nrowsperchunk(const size_type nlevels_) { + level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); + } + KOKKOS_INLINE_FUNCTION size_type get_nrows() const { return nrows; } @@ -185,10 +209,18 @@ class SPILUKHandle { this->level_maxrows = level_maxrows_; } + KOKKOS_INLINE_FUNCTION + size_type get_level_maxrowsperchunk() const { return level_maxrowsperchunk; } + + KOKKOS_INLINE_FUNCTION + void set_level_maxrowsperchunk(const size_type level_maxrowsperchunk_) { + this->level_maxrowsperchunk = level_maxrowsperchunk_; + } + bool is_symbolic_complete() const { return symbolic_complete; } - size_type get_num_levels() const { return nlevel; } - void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } + size_type get_num_levels() const { return nlevels; } + void set_num_levels(size_type nlevels_) { this->nlevels = nlevels_; } void set_symbolic_complete() { this->symbolic_complete = true; } void reset_symbolic_complete() { this->symbolic_complete = false; } @@ -202,11 +234,9 @@ class SPILUKHandle { void print_algorithm() { if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP) std::cout << "SEQLVLSCHD_RP" << std::endl; - ; if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; - ; /* if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 6a1300d747..d0b80ace69 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -323,9 +323,9 @@ struct ILUKLvlSchedTP1NumericFunctor { if (ipos != -1) { auto lxu = -U_values(kk) * fact; if (col < rowid) - L_values(ipos) += lxu; + Kokkos::atomic_add(&L_values(ipos), lxu); else - U_values(ipos) += lxu; + Kokkos::atomic_add(&U_values(ipos), lxu); } }); // end for kk @@ -383,28 +383,51 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using WorkViewType = + Kokkos::View>; + using LevelHostViewType = Kokkos::View; size_type nlevels = thandle.get_num_levels(); size_type nrows = thandle.get_nrows(); - // Keep this as host View, create device version and copy to back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + // Keep these as host View, create device version and copy back to host + HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); + HandleDeviceEntriesType level_nrowsperchunk = + thandle.get_level_nrowsperchunk(); + // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM // space, a fence would be required before each access since UVM views can // share pages. - Kokkos::View level_ptr_h( + LevelHostViewType level_ptr_h, level_nchunks_h, level_nrowsperchunk_h; + WorkViewType iw; + + level_ptr_h = LevelHostViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - - using WorkViewType = - Kokkos::View>; - - WorkViewType iw("iw", thandle.get_level_maxrows(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_nchunks_h = LevelHostViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), + level_nchunks.extent(0)); + level_nrowsperchunk_h = + LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "Host level nrowsperchunk"), + level_nrowsperchunk.extent(0)); + Kokkos::deep_copy(level_nchunks_h, level_nchunks); + Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrowsperchunk(), nrows); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } else { + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrows(), nrows); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -429,20 +452,33 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using policy_type = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start); - if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", - policy_type(lev_end - lev_start, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for("parfor_l_team", - policy_type(lev_end - lev_start, team_size), - tstf); + nnz_lno_t lvl_rowid_start = 0; + nnz_lno_t lvl_nrows_chunk; + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, level_idx, iw, + lev_start + lvl_rowid_start); + + if (team_size == -1) + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, Kokkos::AUTO), + tstf); + else + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, team_size), tstf); + + lvl_rowid_start += lvl_nrows_chunk; + } } // /* // // TP2 algorithm has issues with some offset-ordinal combo to be diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index ff464951c7..90bb88e057 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -63,12 +63,14 @@ namespace Experimental { template void level_sched(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, const size_type nrows, - LevelType1& level_list, LevelType2& level_ptr, - LevelType2& level_idx, size_type& nlevels) { + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host - typedef typename IlukHandle::nnz_lno_t nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + + size_type nrows = thandle.get_nrows(); nlevels = 0; level_ptr(0) = 0; @@ -117,6 +119,106 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); } +// SEQLVLSCHD_TP1 algorithm (chunks) +template +void level_sched(IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, + size_type& nlevels) { + // Scheduling currently compute on host + + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + + size_type nrows = thandle.get_nrows(); + + nlevels = 0; + level_ptr(0) = 0; + + for (size_type i = 0; i < nrows; ++i) { + size_type l = 0; + size_type rowstart = row_map(i); + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; ++j) { + nnz_lno_t col = entries(j); + l = std::max(l, level_list(col)); + } + level_list(i) = l + 1; + level_ptr(l + 1) += 1; + nlevels = std::max(nlevels, l + 1); + } + + for (size_type i = 1; i <= nlevels; ++i) { + level_ptr(i) += level_ptr(i - 1); + } + + for (size_type i = 0; i < nrows; i++) { + level_idx(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; + } + + if (nlevels > 0) { // note: to avoid wrapping around to the max of size_t + // when nlevels = 0. + for (size_type i = nlevels - 1; i > 0; --i) { + level_ptr(i) = level_ptr(i - 1); + } + } + + level_ptr(0) = 0; + + // Find max rows, number of chunks, max rows of chunks across levels + using HostViewType = + Kokkos::View; + + HostViewType lnchunks("lnchunks", nlevels); + HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); + +#ifdef KOKKOS_ENABLE_CUDA + using memory_space = typename IlukHandle::memory_space; + size_t avail_byte = 0; + if (std::is_same::value) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, + total_byte); + avail_byte = static_cast(0.85 * free_byte); + } +#endif + + size_type maxrows = 0; + size_type maxrowsperchunk = 0; + for (size_type i = 0; i < nlevels; ++i) { + size_type lnrows = level_ptr(i + 1) - level_ptr(i); + if (maxrows < lnrows) { + maxrows = lnrows; + } +#ifdef KOKKOS_ENABLE_CUDA + size_t required_size = + static_cast(lnrows) * nrows * sizeof(nnz_lno_t); + if (std::is_same::value) { + lnchunks(i) = required_size / avail_byte + 1; + lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) + ? (lnrows / lnchunks(i)) + : (lnrows / lnchunks(i) + 1); + } else +#endif + { + lnchunks(i) = 1; + lnrowsperchunk(i) = lnrows; + } + if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { + maxrowsperchunk = lnrowsperchunk(i); + } + } + + thandle.set_num_levels(nlevels); + thandle.set_level_maxrows(maxrows); + thandle.set_level_maxrowsperchunk(maxrowsperchunk); + + level_nchunks = lnchunks; + level_nrowsperchunk = lnrowsperchunk; +} + // Linear Search for the smallest row index template size_type search_col_index(nnz_lno_t j, size_type lenl, ViewType h_iL, @@ -166,11 +268,11 @@ void iluk_symbolic(IlukHandle& thandle, // Scheduling and symbolic phase currently compute on host - need host copy // of all views - typedef typename IlukHandle::size_type size_type; - typedef typename IlukHandle::nnz_lno_t nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; - typedef typename IlukHandle::nnz_lno_view_t HandleDeviceEntriesType; - typedef typename IlukHandle::nnz_row_view_t HandleDeviceRowMapType; + using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; // typedef typename IlukHandle::signed_integral_t signed_integral_t; @@ -217,13 +319,14 @@ void iluk_symbolic(IlukHandle& thandle, // Can only resize managed views Kokkos::resize(L_entries_d, // L_entries_d.extent(0)-3); thandle.set_nnzL(L_entries_d.extent(0)+5); - typedef Kokkos::View - HostTmpViewType; + using HostTmpViewType = + Kokkos::View; HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); HostTmpViewType h_llev("h_llev", nrows); + HostTmpViewType level_nchunks, level_nrowsperchunk; size_type cntL = 0; size_type cntU = 0; @@ -367,8 +470,31 @@ void iluk_symbolic(IlukHandle& thandle, } // Level scheduling on L - level_sched(thandle, L_row_map, L_entries, nrows, level_list, level_ptr, - level_idx, nlev); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, level_nchunks, level_nrowsperchunk, nlev); + + thandle.alloc_level_nchunks(nlev); + thandle.alloc_level_nrowsperchunk(nlev); + HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks(); + HandleDeviceEntriesType dlevel_nrowsperchunk = + thandle.get_level_nrowsperchunk(); + Kokkos::deep_copy(dlevel_nchunks, level_nchunks); + Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk); + } else { + level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); + } + + Kokkos::deep_copy(dlevel_ptr, level_ptr); + Kokkos::deep_copy(dlevel_idx, level_idx); + Kokkos::deep_copy(dlevel_list, level_list); + + Kokkos::deep_copy(L_row_map_d, L_row_map); + Kokkos::deep_copy(L_entries_d, L_entries); + Kokkos::deep_copy(U_row_map_d, U_row_map); + Kokkos::deep_copy(U_entries_d, U_entries); thandle.set_symbolic_complete(); @@ -378,8 +504,10 @@ void iluk_symbolic(IlukHandle& thandle, std::cout << " symbolic complete: " << thandle.is_symbolic_complete() << std::endl; std::cout << " num levels: " << thandle.get_num_levels() << std::endl; - std::cout << " max num rows levels: " << thandle.get_level_maxrows() + std::cout << " max num rows among levels: " << thandle.get_level_maxrows() << std::endl; + std::cout << " max num rows among chunks among levels: " + << thandle.get_level_maxrowsperchunk() << std::endl; std::cout << " iluk_symbolic result: " << std::endl; @@ -427,15 +555,6 @@ void iluk_symbolic(IlukHandle& thandle, } std::cout << std::endl; #endif - - Kokkos::deep_copy(dlevel_ptr, level_ptr); - Kokkos::deep_copy(dlevel_idx, level_idx); - Kokkos::deep_copy(dlevel_list, level_list); - - Kokkos::deep_copy(L_row_map_d, L_row_map); - Kokkos::deep_copy(L_entries_d, L_entries); - Kokkos::deep_copy(U_row_map_d, U_row_map); - Kokkos::deep_copy(U_entries_d, U_entries); } } // end iluk_symbolic From 33d4cceb7abd2de015d4a65a7502f3a64b349f58 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 13 Apr 2022 16:35:34 -0600 Subject: [PATCH 02/11] Merge pull request #1380 from ndellingwood/improve-sptrsv-symb Improve sptrsv symb (cherry picked from commit 1764296492cad7e349c34fe06a3148e351fb8b28) --- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 172 ++++++------------ 1 file changed, 56 insertions(+), 116 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 4d11112493..3a6f988835 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -223,65 +223,33 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - HostSignedEntriesType previous_level_list( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "previous_level_list"), - nrows); - Kokkos::deep_copy(previous_level_list, signed_integral_t(-1)); - - const bool stored_diagonal = thandle.is_stored_diagonal(); - // diagonal_offsets is uninitialized - deep_copy unnecessary at the - // beginning, only needed at the end - auto diagonal_offsets = thandle.get_diagonal_offsets(); - auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - - size_type level = 0; - auto starting_node = 0; - auto ending_node = nrows; - - size_type node_count = 0; - - while (node_count < nrows) { - for (size_type row = starting_node; row < ending_node; ++row) { - if (level_list(row) == -1) { // unmarked - bool is_root = true; - signed_integral_t ptrstart = row_map(row); - signed_integral_t ptrend = row_map(row + 1); - - for (signed_integral_t offset = ptrstart; offset < ptrend; ++offset) { - size_type col = entries(offset); - if (previous_level_list(col) == -1 && col != row) { // unmarked - if (col < row) { - is_root = false; - break; - } - } else if (col == row) { - if (stored_diagonal) hdiagonal_offsets(row) = offset; - } else if (col > row) { - std::cout << "\nrow = " << row << " col = " << col - << " offset = " << offset << std::endl; - throw( - std::runtime_error("SYMB ERROR: Lower tri with colid > rowid " - "- SHOULD NOT HAPPEN!!!")); - } - } // end for offset , i.e. cols of this row - - if (is_root == true) { - level_list(row) = level; - nodes_per_level(level) += 1; - nodes_grouped_by_level(node_count) = row; - node_count += 1; - } - - } // end if - } // end for row - - // Kokkos::deep_copy(previous_level_list, level_list); - for (size_type i = 0; i < nrows; ++i) { - previous_level_list(i) = level_list(i); + signed_integral_t level = 0; + size_type node_count = 0; + + typename DeviceEntriesType::HostMirror level_ptr( + "lp", nrows + 1); // temp View used for index bookkeeping + level_ptr(0) = 0; + for (size_type i = 0; i < nrows; ++i) { + signed_integral_t l = 0; + size_type rowstart = row_map(i); + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; j++) { + size_type col = entries(j); + l = std::max(l, level_list(col)); } - - level += 1; - } // end while + level_list(i) = l + 1; + nodes_per_level(l) += 1; // 0-based indexing + level_ptr(l + 1) += 1; + level = std::max(level, l + 1); + node_count++; + } + for (signed_integral_t i = 1; i <= level; ++i) { + level_ptr(i) += level_ptr(i - 1); + } + for (size_type i = 0; i < nrows; i++) { + nodes_grouped_by_level(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; + } thandle.set_num_levels(level); @@ -320,9 +288,8 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - if (stored_diagonal) Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); - // Extra check: + // Extra check: #ifdef LVL_OUTPUT_INFO { std::cout << " End symb - extra checks" << std::endl; @@ -705,61 +672,35 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - HostSignedEntriesType previous_level_list( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "previous_level_list"), - nrows); - Kokkos::deep_copy(previous_level_list, signed_integral_t(-1)); - - const bool stored_diagonal = thandle.is_stored_diagonal(); - // diagonal_offsets is uninitialized - deep_copy unnecessary at the - // beginning, only needed at the end - auto diagonal_offsets = thandle.get_diagonal_offsets(); - auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - - size_type level = 0; - auto starting_node = nrows - 1; - auto ending_node = 0; - - size_type node_count = 0; - - while (node_count < nrows) { - for (signed_integral_t row = starting_node; row >= ending_node; --row) { - if (level_list(row) == -1) { // unmarked - bool is_root = true; - signed_integral_t ptrstart = row_map(row); - signed_integral_t ptrend = row_map(row + 1); - - for (signed_integral_t offset = ptrend - 1; offset >= ptrstart; - --offset) { - signed_integral_t col = entries(offset); - - if (previous_level_list(col) == -1 && col != row) { // unmarked - if (col > row) { - is_root = false; - break; - } - } else if (col == row) { - if (stored_diagonal) hdiagonal_offsets(row) = offset; - } - } // end for offset , i.e. cols of this row - - if (is_root == true) { - level_list(row) = level; - nodes_per_level(level) += 1; - nodes_grouped_by_level(node_count) = row; - node_count += 1; - } - - } // end if - } // end for row - - // Kokkos::deep_copy(previous_level_list, level_list); - for (size_type i = 0; i < nrows; ++i) { - previous_level_list(i) = level_list(i); + signed_integral_t level = 0; + size_type node_count = 0; + + typename DeviceEntriesType::HostMirror level_ptr( + "lp", nrows + 1); // temp View used for index bookkeeping + level_ptr(0) = 0; + for (size_type ii = nrows; ii > 0; ii--) { + size_type i = ii - 1; // Avoid >= 0 comparison in for-loop to prevent + // wraparound errors with unsigned types + signed_integral_t l = 0; + size_type rowstart = row_map(i) + 1; // skip diag + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; ++j) { + size_type col = entries(j); + l = std::max(l, level_list(col)); } - - level += 1; - } // end while + level_list(i) = l + 1; + nodes_per_level(l) += 1; // 0-based indexing + level_ptr(l + 1) += 1; + level = std::max(level, l + 1); + node_count++; + } + for (signed_integral_t i = 1; i <= level; ++i) { + level_ptr(i) += level_ptr(i - 1); + } + for (size_type i = 0; i < nrows; i++) { + nodes_grouped_by_level(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; + } thandle.set_num_levels(level); @@ -798,9 +739,8 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - if (stored_diagonal) Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); - // Extra check: + // Extra check: #ifdef LVL_OUTPUT_INFO { std::cout << " End symb - extra checks" << std::endl; From 77bc5fe88e40827bc5b332755df5ee287f0b62a2 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Mar 2022 13:37:25 -0700 Subject: [PATCH 03/11] Restore BLAS-1 MV paths for 1 column Also: test these paths, test nrm2w, and use 3-arg (async) deep copies in the >1 column paths of these kernels. (cherry picked from commit ef9f08b5029008bbef46ebf3b2473f5311598697) --- src/blas/impl/KokkosBlas1_dot_mv_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_dot_spec.hpp | 45 +++- src/blas/impl/KokkosBlas1_nrm1_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm1_spec.hpp | 21 +- src/blas/impl/KokkosBlas1_nrm2_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm2_spec.hpp | 22 +- src/blas/impl/KokkosBlas1_nrm2w_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm2w_spec.hpp | 23 +- src/blas/impl/KokkosBlas1_sum_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_sum_spec.hpp | 21 +- unit_test/blas/Test_Blas.hpp | 1 + unit_test/blas/Test_Blas1_dot.hpp | 3 + unit_test/blas/Test_Blas1_nrm1.hpp | 3 + unit_test/blas/Test_Blas1_nrm2.hpp | 3 + unit_test/blas/Test_Blas1_nrm2_squared.hpp | 3 + unit_test/blas/Test_Blas1_nrm2w.hpp | 234 +++++++++++++++++++++ unit_test/blas/Test_Blas1_sum.hpp | 3 + 17 files changed, 370 insertions(+), 37 deletions(-) create mode 100644 unit_test/blas/Test_Blas1_nrm2w.hpp diff --git a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 500dc035ca..dfbae10a99 100644 --- a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -131,7 +131,8 @@ void MV_Dot_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerDot; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -156,7 +157,7 @@ void MV_Dot_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); MV_Dot_Invoke(tempResult, x, y); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_dot_spec.hpp b/src/blas/impl/KokkosBlas1_dot_spec.hpp index 350934230d..33c7603057 100644 --- a/src/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/src/blas/impl/KokkosBlas1_dot_spec.hpp @@ -377,6 +377,20 @@ struct Dot + static auto getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return Kokkos::subview(v, Kokkos::ALL(), 0); + } + + template + static V getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return v; + } + static void dot(const RV& R, const XV& X, const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" @@ -392,14 +406,31 @@ struct Dot(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - MV_Dot_Invoke(R, X, Y); + const size_type numDots = std::max(X.extent(1), Y.extent(1)); + if (numDots == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = getFirstColumn(X); + auto Y0 = getFirstColumn(Y); + if (numRows < static_cast(INT_MAX)) { + typedef int index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } else { + typedef int64_t index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } } else { - typedef std::int64_t index_type; - MV_Dot_Invoke(R, X, Y); + if (numRows < static_cast(INT_MAX) && + numRows * numDots < static_cast(INT_MAX)) { + typedef int index_type; + MV_Dot_Invoke(R, X, Y); + } else { + typedef std::int64_t index_type; + MV_Dot_Invoke(R, X, Y); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 07422035b7..2002ef2c39 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -170,7 +170,8 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -195,7 +196,7 @@ void MV_Nrm1_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); MV_Nrm1_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp index df86d00fa2..478395d7a9 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -200,12 +200,23 @@ struct Nrm1 { : "KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm1_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm1_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Nrm1_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Nrm1_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm1_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Nrm1_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index 4efc0e6c6d..f2b0e826bc 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -200,7 +200,8 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); MV_Nrm2_Invoke(tempResult, x, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp index 340d78fdf1..71afb2ede3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -200,12 +200,24 @@ struct Nrm2 { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2_Invoke(R0, X0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2_Invoke(R0, X0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2_Invoke(R, X, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2_Invoke(R, X, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 3013fd17f8..3f202ca430 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -199,7 +199,8 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2w_Invoke( r.extent(0)); MV_Nrm2w_Invoke(tempResult, x, w, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp index fe437bbc5c..28162bce5f 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -201,12 +201,25 @@ struct Nrm2w { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numCols == 1) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto W0 = Kokkos::subview(W, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2w_Invoke(R0, X0, W0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2w_Invoke(R0, X0, W0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_sum_impl.hpp b/src/blas/impl/KokkosBlas1_sum_impl.hpp index 05cede0f0d..b87f2e1092 100644 --- a/src/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/src/blas/impl/KokkosBlas1_sum_impl.hpp @@ -162,7 +162,8 @@ void MV_Sum_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -187,7 +188,7 @@ void MV_Sum_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); MV_Sum_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_sum_spec.hpp b/src/blas/impl/KokkosBlas1_sum_spec.hpp index 505296cab9..09c34299c7 100644 --- a/src/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/src/blas/impl/KokkosBlas1_sum_spec.hpp @@ -197,12 +197,23 @@ struct Sum { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Sum_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Sum_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Sum_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Sum_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Sum_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Sum_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 642a0bf5f0..5244c35e53 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -15,6 +15,7 @@ #include "Test_Blas1_nrm1.hpp" #include "Test_Blas1_nrm2_squared.hpp" #include "Test_Blas1_nrm2.hpp" +#include "Test_Blas1_nrm2w.hpp" #include "Test_Blas1_nrminf.hpp" #include "Test_Blas1_reciprocal.hpp" #include "Test_Blas1_scal.hpp" diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 920ac06c77..536e58486c 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -196,6 +196,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -207,6 +208,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -218,6 +220,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index 72861bf5a3..c68492b6dd 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -149,6 +149,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -159,6 +160,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -169,6 +171,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 94d5414e15..688035f842 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -144,6 +144,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -154,6 +155,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -164,6 +166,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ca357acdb2..317b9b543b 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -160,6 +160,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -170,6 +171,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -180,6 +182,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2w.hpp b/unit_test/blas/Test_Blas1_nrm2w.hpp new file mode 100644 index 0000000000..cda59c83e4 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w.hpp @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + expected_result = + Kokkos::ArithTraits::sqrt(expected_result); + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + expected_result[j] = + Kokkos::ArithTraits::sqrt(expected_result[j]); + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_float"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_double"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); + test_nrm2w, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_complex_double"); + test_nrm2w_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_int"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index 768091885c..2b7f51370e 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -133,6 +133,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -143,6 +144,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -153,6 +155,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif From 48b1756b71a324bda06a5f011b66b4d6853f3d9d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Mar 2022 16:35:16 -0700 Subject: [PATCH 04/11] Fix types in test (cherry picked from commit 89111309f691fdd7783c283ca8ac5dbaa1d4fa1d) --- unit_test/blas/Test_Blas1_dot.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 536e58486c..b2e3f95628 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -208,7 +208,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(789, 1); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -220,7 +220,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(789, 1); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif From 57ff6ff84fb5e65bd3f3cf6e09a13de0b64f918d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Mar 2022 11:59:26 -0700 Subject: [PATCH 05/11] Fix nrm2w unification layer, add nrm2w_squared test (cherry picked from commit 1f7a45e00f5be82c87ff74bf14b7d217b37c985b) --- src/blas/KokkosBlas1_nrm2w.hpp | 26 ++- src/blas/KokkosBlas1_nrm2w_squared.hpp | 26 ++- unit_test/blas/Test_Blas.hpp | 1 + unit_test/blas/Test_Blas1_nrm2w_squared.hpp | 232 ++++++++++++++++++++ 4 files changed, 261 insertions(+), 24 deletions(-) create mode 100644 unit_test/blas/Test_Blas1_nrm2w_squared.hpp diff --git a/src/blas/KokkosBlas1_nrm2w.hpp b/src/blas/KokkosBlas1_nrm2w.hpp index 981897d9ae..43d32e7812 100644 --- a/src/blas/KokkosBlas1_nrm2w.hpp +++ b/src/blas/KokkosBlas1_nrm2w.hpp @@ -76,7 +76,8 @@ nrm2w(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -134,20 +135,21 @@ void nrm2w(const RV& R, const XMV& X, const XMV& W, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/src/blas/KokkosBlas1_nrm2w_squared.hpp b/src/blas/KokkosBlas1_nrm2w_squared.hpp index 2ab07af0c5..6aec955de2 100644 --- a/src/blas/KokkosBlas1_nrm2w_squared.hpp +++ b/src/blas/KokkosBlas1_nrm2w_squared.hpp @@ -77,7 +77,8 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -135,20 +136,21 @@ void nrm2w_squared( KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 5244c35e53..16d54e3dce 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -15,6 +15,7 @@ #include "Test_Blas1_nrm1.hpp" #include "Test_Blas1_nrm2_squared.hpp" #include "Test_Blas1_nrm2.hpp" +#include "Test_Blas1_nrm2w_squared.hpp" #include "Test_Blas1_nrm2w.hpp" #include "Test_Blas1_nrminf.hpp" #include "Test_Blas1_reciprocal.hpp" diff --git a/unit_test/blas/Test_Blas1_nrm2w_squared.hpp b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp new file mode 100644 index 0000000000..14f1c90766 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w_squared(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_squared_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w_squared(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w_squared() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_squared_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_float"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_double"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_complex_double"); + test_nrm2w_squared, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); + test_nrm2w_squared_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_int"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif From f32fa284c4b32a818d62daac28cd3c523172ae82 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 29 Apr 2022 13:41:30 -0600 Subject: [PATCH 06/11] Merge pull request #1394 from ndellingwood/issue-1367 Update View value_type and const_value_type for compile time checks (cherry picked from commit 1033a62b61c558753b5fe37022a4f71f41438f56) --- src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index 2165387076..f3c6c6bb67 100644 --- a/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -88,7 +88,8 @@ struct CrsMatrixGetDiagCopyWithOffsetsFunctor { static_cast(DiagType::rank) == 1, "The DiagType template parameter must be a 1-D Kokkos::View."); static_assert( - std::is_same::value, + std::is_same::value, "The DiagType template parameter must be a nonconst Kokkos::View."); static_assert(Kokkos::is_view::value, "The OffsetsType template parameter must be a Kokkos::View."); From 8c0d39f3d2c7193395c7b96d353b5b10ce8e2e8f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 23 May 2022 16:19:39 -0600 Subject: [PATCH 07/11] CMakeLists.txt: Update to version 3.6.01 for patch release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4c8a3ccfa..ba5323df27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) SET(KokkosKernels_VERSION_MINOR 6) - SET(KokkosKernels_VERSION_PATCH 00) + SET(KokkosKernels_VERSION_PATCH 01) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() From e1d8de4273538d0c7b9f3cb532fd577e4c08a999 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 23 May 2022 16:25:11 -0600 Subject: [PATCH 08/11] Adding Changelog for Release 3.6.01 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.6 --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a961701013..97dce4835d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Change Log +## [3.6.01](https://github.com/kokkos/kokkos-kernels/tree/3.6.01) (2022-05-23) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.00...3.6.01) + +### Bug Fixes and Improvements: + +- Improve spiluk numeric phase to avoid race conditions and processing in chunks [\#1390](https://github.com/kokkos/kokkos-kernels/pull/1390) +- Improve sptrsv symbolic phase performance (level scheduling) [\#1380](https://github.com/kokkos/kokkos-kernels/pull/1380) +- Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354) +- Fix check that view has const type [\#1370](https://github.com/kokkos/kokkos-kernels/pull/1370) +- Fix check that view has const type part 2 [\#1394](https://github.com/kokkos/kokkos-kernels/pull/1394) + ## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) From b22bd37d9291b1b9e9a4032164cfb6fc079c66f5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 23 May 2022 16:28:22 -0600 Subject: [PATCH 09/11] Update master_history for Kokkos 3.6.01 --- master_history.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/master_history.txt b/master_history.txt index 222913d92c..ddf9143c73 100644 --- a/master_history.txt +++ b/master_history.txt @@ -16,3 +16,4 @@ tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910 tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 +tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 From 63be973508056def3534484e9c1878ba6873d28c Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 22 Jun 2022 11:57:25 -0600 Subject: [PATCH 10/11] Fix Trilinos issue #10612 ArithTraits::isNan and isInf, when building with HIP. --- src/Kokkos_ArithTraits.hpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index bf7235e507..05efc5a022 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -1284,10 +1284,7 @@ class ArithTraits > { } #else static bool isInf(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#endif - return isinf(real(x)) || isinf(imag(x)); + return Kokkos::Experimental::isinf(real(x)) || Kokkos::Experimental::isinf(imag(x)); } #endif #ifdef KOKKOS_ENABLE_SYCL @@ -1307,10 +1304,7 @@ class ArithTraits > { } #else static bool isNan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#endif - return isnan(real(x)) || isnan(imag(x)); + return Kokkos::Experimental::isnan(real(x)) || Kokkos::Experimental::isnan(imag(x)); } #endif static mag_type abs(const std::complex& x) { From 87174c302a82a5feb07e3bbfd68fd6479daa0d08 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 22 Jun 2022 20:40:54 -0600 Subject: [PATCH 11/11] apply clang-format to Kokkos_ArithTraits.hpp --- src/Kokkos_ArithTraits.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 05efc5a022..68bcdf79ea 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -1284,7 +1284,8 @@ class ArithTraits > { } #else static bool isInf(const std::complex& x) { - return Kokkos::Experimental::isinf(real(x)) || Kokkos::Experimental::isinf(imag(x)); + return Kokkos::Experimental::isinf(real(x)) || + Kokkos::Experimental::isinf(imag(x)); } #endif #ifdef KOKKOS_ENABLE_SYCL @@ -1304,7 +1305,8 @@ class ArithTraits > { } #else static bool isNan(const std::complex& x) { - return Kokkos::Experimental::isnan(real(x)) || Kokkos::Experimental::isnan(imag(x)); + return Kokkos::Experimental::isnan(real(x)) || + Kokkos::Experimental::isnan(imag(x)); } #endif static mag_type abs(const std::complex& x) {