diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 129fe368c1..6dbc3039da 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -28,12 +28,18 @@ #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Enable supernodal sptrsv -#include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBatched_Util.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" -#include "KokkosBatched_Trsm_Team_Impl.hpp" #endif +#include "KokkosBlas3_trsm.hpp" +#include "KokkosBatched_Trsv_Decl.hpp" +#include "KokkosBatched_Trsm_Team_Impl.hpp" +#include "KokkosBlas1_team_axpby.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosBlas1_set.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBlas2_serial_gemv_impl.hpp" #define KOKKOSKERNELS_SPTRSV_TRILVLSCHED @@ -73,7 +79,7 @@ struct SptrsvWrap { using range_type = Kokkos::pair; // Tag structs - struct UnsortedTag {}; + struct UnsortedTag {}; // This doesn't appear to be supported struct LargerCutoffTag {}; struct UnsortedLargerCutoffTag {}; @@ -96,7 +102,9 @@ struct SptrsvWrap { }; /** - * Common base class for sptrsv functors + * Common base class for sptrsv functors that need to work for both + * point and block matrices. Default version does not support + * blocks */ template struct Common { @@ -107,6 +115,18 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; + using reftype = scalar_t &; + + struct SBlock { + template + KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} + + KOKKOS_INLINE_FUNCTION + scalar_t *data() { return nullptr; } + + static int shmem_size(size_type, size_type) { return 0; } + }; + Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : row_map(row_map_), @@ -115,121 +135,349 @@ struct SptrsvWrap { lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_) { - KK_REQUIRE_MSG(!BlockEnabled, "Blocks are not yet supported."); - KK_REQUIRE_MSG(block_size_ == 0, "Blocks are not yet supported."); + KK_REQUIRE_MSG(block_size_ == 0, "Tried to use blocks with the unblocked Common?"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return 0; } + + // lget + KOKKOS_INLINE_FUNCTION + scalar_t &lget(const size_type row) const { return lhs(row); } + + // rget + KOKKOS_INLINE_FUNCTION + scalar_t rget(const size_type row) const { return rhs(row); } + + // vget + KOKKOS_INLINE_FUNCTION + scalar_t vget(const size_type nnz) const { return values(nnz); } + + // lhs = (lhs + rhs) / diag (team) + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const member_type &team, scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs_val = (lhs_val + rhs_val) / diag_val; }); + } + + // lhs = (lhs + rhs) / diag (serial) + KOKKOS_INLINE_FUNCTION + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { + lhs_val = (lhs_val + rhs_val) / diag_val; + } + }; + + // Partial specialization for block support + template + struct Common { + // BSR data is in LayoutRight! 
+ using Layout = Kokkos::LayoutRight; + + using Block = Kokkos::View>; + + // const block + using CBlock = Kokkos::View>; + + // scratch block + using SBlock = Kokkos::View>; + + using Vector = Kokkos::View>; + + using CVector = Kokkos::View>; + + static constexpr size_type MAX_VEC_SIZE = 16; + static constexpr size_type BUFF_SIZE = 256; + + using reftype = Vector; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + size_type block_size; + size_type block_items; + + Common(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + block_size(block_size_), + block_items(block_size * block_size) { + KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); + } + + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + // assign + template + KOKKOS_INLINE_FUNCTION static void assign(const View1 &lhs_, const View2 &rhs_) { + for (size_t i = 0; i < lhs_.size(); ++i) { + lhs_.data()[i] = rhs_.data()[i]; + } + } + + template + KOKKOS_INLINE_FUNCTION static void assign(const member_type &team, const View1 &lhs_, const View2 &rhs_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, lhs_.size()), + [&](const size_type i) { lhs_.data()[i] = rhs_.data()[i]; }); + } + + // add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const member_type &team, const CVector &x, const Vector &y) { + KokkosBlas::Experimental::axpy(team, 1.0, x, y); + } + + // serial add. y += x + KOKKOS_INLINE_FUNCTION + static void add(const CVector &x, const Vector &y) { KokkosBlas::serial_axpy(1.0, x, y); } + + // divide. b /= A (b = b * A^-1) + KOKKOS_INLINE_FUNCTION + static void divide(const member_type &team, const Vector &b, const CBlock &A) { + // Team-shared buffer. Use for team work. + const auto block_size_ = b.size(); + SBlock shared_buff(team.team_shmem(), block_size_, block_size_); + + // Need a temp block to do LU of A + Block LU(shared_buff.data(), block_size_, block_size_); + assign(team, LU, A); + team.team_barrier(); + KokkosBatched::TeamLU::invoke(team, LU); + + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsv first + team.team_barrier(); + KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, + b); + + team.team_barrier(); + KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); + } + + // serial divide. b /= A (b = b * A^-1) + KOKKOS_INLINE_FUNCTION + static void divide(const Vector &b, const CBlock &A) { + // Thread-local buffers. Use for Serial (non-team) work + scalar_t buff[BUFF_SIZE]; + + // Need a temp block to do LU of A + const auto block_size_ = b.size(); + KK_KERNEL_REQUIRE_MSG(block_size_ <= MAX_VEC_SIZE, + "Max supported block size for range-policy is 16. 
Use team-policy alg if you need more."); + + Block LU(&buff[0], block_size_, block_size_); + assign(LU, A); + KokkosBatched::SerialLU::invoke(LU); + + // A = LU + // A^-1 = U^-1 * L^-1 + // b = (b * U^-1) * L^-1, so do U trsv first + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); + + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); } - struct ReduceSumFunctor { - const Common *m_obj; - const lno_t rowid; - lno_t diag; + // lget + KOKKOS_INLINE_FUNCTION + Vector lget(const size_type row) const { return Vector(lhs.data() + (row * block_size), block_size); } + + // rget + KOKKOS_INLINE_FUNCTION + CVector rget(const size_type row) const { return CVector(rhs.data() + (row * block_size), block_size); } + + // vget + KOKKOS_INLINE_FUNCTION + CBlock vget(const size_type block) const { + return CBlock(values.data() + (block * block_items), block_size, block_size); + } + + // lhs = (lhs + rhs) / diag + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const member_type &team, const Vector &lhs_val, const CVector &rhs_val, + const CBlock &diag_val) { + add(team, rhs_val, lhs_val); + team.team_barrier(); + divide(team, lhs_val, diag_val); + } + + KOKKOS_INLINE_FUNCTION + static void add_and_divide(const Vector &lhs_val, const CVector &rhs_val, const CBlock &diag_val) { + add(rhs_val, lhs_val); + divide(lhs_val, diag_val); + } + }; + + /** + * Intermediate class that contains implementation that identical + * for blocked / non-blocked + */ + template + struct Intermediate : public Common { + using Base = Common; + + Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} + + struct ReduceFunctorBasic { + const Base *m_obj; + + KOKKOS_INLINE_FUNCTION + ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {} + + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const scalar_t &val, const scalar_t &lhs_col_val, scalar_t &accum) { + accum -= val * lhs_col_val; + } KOKKOS_INLINE_FUNCTION void operator()(size_type i, scalar_t &accum) const { const auto colid = m_obj->entries(i); - auto val = m_obj->values(i); - auto lhs_colid = m_obj->lhs(colid); - accum -= val * lhs_colid; - KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); + multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum); } }; - struct ReduceSumDiagFunctor { - const Common *m_obj; - const lno_t rowid; - lno_t diag; + struct ReduceFunctorBlock : public ReduceFunctorBasic { + using P = ReduceFunctorBasic; + + const size_type block_size; + const size_type b; + + KOKKOS_INLINE_FUNCTION + ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0) + : P(obj), block_size(block_size_), b(b_) {} KOKKOS_INLINE_FUNCTION void operator()(size_type i, scalar_t &accum) const { - const auto colid = m_obj->entries(i); - if (colid != rowid) { - auto val = m_obj->values(i); - auto lhs_colid = m_obj->lhs(colid); - accum -= val * lhs_colid; - } else { - diag = i; - } + const auto idx = i / block_size; + const auto colid = P::m_obj->entries(idx); + P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum); } }; - KOKKOS_INLINE_FUNCTION - static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, const scalar_t &diag_val) { - lhs_val = (lhs_val + rhs_val) / diag_val; - } + /** + * If we want to support 
Unsorted, we'll need a Functor that returns the ptr + * of the diag item (colid == rowid). Possibly via multi-reduce? The UnsortedTag + * is defined above but no policies actually use it. + */ template KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const { - static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec), - "ThreadVectorRanges are not yet supported for block-enabled"); static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?"); + static_assert(IsSorted, "Unsorted is not yet supported."); - const auto rowid = nodes_grouped_by_level(my_rank + node_count); - const auto soffset = row_map(rowid); - const auto eoffset = row_map(rowid + 1); - const auto rhs_val = rhs(rowid); - scalar_t &lhs_val = lhs(rowid); + const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count); + const auto soffset = Base::row_map(rowid); + const auto eoffset = Base::row_map(rowid + 1); + const auto rhs_val = Base::rget(rowid); // Set up range to auto-skip diag if is sorted const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); // We don't need the reducer to find the diag item if sorted - using reducer_t = std::conditional_t; - reducer_t rf{this, rowid, -1}; + typename Base::reftype lhs_val = Base::lget(rowid); + + const auto block_size_ = BlockEnabled ? Base::get_block_size() : 1; + (void)block_size_; // Some settings do not use this var if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); - for (auto ptr = itr_b; ptr < itr_e; ++ptr) { - rf(ptr, lhs_val); + if constexpr (BlockEnabled) { + for (size_type b = 0; b < block_size_; ++b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { + rf(i, lhs_val(b)); + } + } + } else { + ReduceFunctorBasic rf(this, rowid); + for (size_type i = itr_b; i < itr_e; ++i) { + rf(i, lhs_val); + } } } else { KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team"); if constexpr (!UseThreadVec) { - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val); + if constexpr (BlockEnabled) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size_), [&](size_type b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size_, itr_e * block_size_), rf, + lhs_val(b)); + }); + } else { + ReduceFunctorBasic rf(this, rowid); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val); + } team->team_barrier(); } else { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); + if constexpr (BlockEnabled) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size_), [&](size_type b) { + ReduceFunctorBlock rf(this, block_size_, b, rowid); + for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { + rf(i, lhs_val(b)); + } + }); + } else { + ReduceFunctorBasic rf(this, rowid); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); + } } } // If sorted, we already know the diag. Otherwise, get it from the reducer - rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; + const lno_t diag = IsLower ? eoffset - 1 : soffset; // At end, handle the diag element. 
We need to be careful to avoid race // conditions here. if constexpr (IsSerial) { // Serial case is easy, there's only 1 thread so just do the // add_and_divide - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); - add_and_divide(lhs_val, rhs_val, values(rf.diag)); + KK_KERNEL_ASSERT_MSG(diag != -1, "Serial should always know diag"); + Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); } else { - if constexpr (IsSorted) { - // Parallel sorted case is complex. All threads know what the diag is. - // If we have a team sharing the work, we need to ensure only one - // thread performs the add_and_divide. - KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); - if constexpr (!UseThreadVec) { - Kokkos::single(Kokkos::PerTeam(*team), [&]() { add_and_divide(lhs_val, rhs_val, values(rf.diag)); }); - } else { - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - } + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide (except in BlockEnabled, then + // we can use team operations). + KK_KERNEL_ASSERT_MSG(diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(diag)); } else { - // Parallel unsorted case. Only one thread should know what the diag - // item is. We have that one do the add_and_divide. - if (rf.diag != -1) { - add_and_divide(lhs_val, rhs_val, values(rf.diag)); - } + Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); } } } }; + // + // TriLvlSched functors + // + template struct TriLvlSchedTP1SolverFunctor - : public Common { - using Base = Common; + : public Intermediate { + using Base = Intermediate; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset @@ -251,13 +499,11 @@ struct SptrsvWrap { } }; - // Lower vs Upper Multi-block Functors - template struct TriLvlSchedRPSolverFunctor - : public Common { - using Base = Common; + : public Intermediate { + using Base = Intermediate; TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, @@ -275,8 +521,8 @@ struct SptrsvWrap { template struct TriLvlSchedTP1SingleBlockFunctor - : public Common { - using Base = Common; + : public Intermediate { + using Base = Intermediate; entries_t nodes_per_level; @@ -336,6 +582,10 @@ struct SptrsvWrap { void operator()(const UnsortedLargerCutoffTag &, const member_type &team) const { common_impl(team); } }; + // + // Supernodal functors + // + #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // ----------------------------------------------------------- // Helper functors for Lower-triangular solve with SpMV @@ -920,7 +1170,7 @@ struct SptrsvWrap { #define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ Functor - template + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) @@ -934,12 +1184,12 @@ struct SptrsvWrap { const auto hnodes_per_level = thandle.get_host_nodes_per_level(); const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); - const auto block_enabled = false; // 
thandle.is_block_enabled(); - assert(block_size == 0); + const auto block_enabled = thandle.is_block_enabled(); + assert(block_enabled == BlockEnabled); // Set up functor types - using LowerRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); - using LowerTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); + using LowerRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, BlockEnabled); + using LowerTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1012,51 +1262,12 @@ struct SptrsvWrap { int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = LowerTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require(tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), ltpp); } - // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed - /* - else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef - Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { - team_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : - 64; - } - int vector_size = thandle.get_team_size(); - if ( vector_size == -1 ) { - vector_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : - 4; - } - - // This impl: "chunk" lvl_nodes into node_groups; a league_rank is - responsible for processing team_size # nodes - // TeamThreadRange over number nodes of node_groups - // To avoid masking threads, 1 thread (team) per node in - node_group (thread has full ownership of a node) - // ThreadVectorRange responsible for the actual solve - computation - //const int node_groups = team_size; - const int node_groups = vector_size; - - #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, true, node_count, vector_size, 0); #else - LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, node_count, node_groups); #endif - Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( - (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size - ), tstf); } // end elseif - */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -1251,7 +1462,7 @@ struct SptrsvWrap { #endif } // end lower_tri_solve - template + template static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) @@ -1267,12 +1478,12 @@ struct SptrsvWrap { auto hnodes_per_level = thandle.get_host_nodes_per_level(); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); const auto block_size = thandle.get_block_size(); - const auto block_enabled = false; // thandle.is_block_enabled(); - assert(block_size == 0); + const auto 
block_enabled = thandle.is_block_enabled(); + assert(block_enabled == BlockEnabled); // Set up functor types - using UpperRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, false); - using UpperTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, false); + using UpperRPFunc = FunctorTypeMacro(TriLvlSchedRPSolverFunctor, false, BlockEnabled); + using UpperTPFunc = FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, false, BlockEnabled); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -1344,50 +1555,12 @@ struct SptrsvWrap { int team_size = thandle.get_team_size(); auto tp = team_size == -1 ? team_policy(space, lvl_nodes, Kokkos::AUTO) : team_policy(space, lvl_nodes, team_size); + const int scratch_size = UpperTPFunc::SBlock::shmem_size(block_size, block_size); + tp = tp.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for( "parfor_u_team", Kokkos::Experimental::require(tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), utpp); } - // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed - /* - else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { -typedef Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { - team_size = std::is_same< typename - Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace ->::value ? 1 : 64; - } - int vector_size = thandle.get_team_size(); - if ( vector_size == -1 ) { - vector_size = std::is_same< typename -Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 4; - } - - // This impl: "chunk" lvl_nodes into node_groups; a league_rank is -responsible for processing that many nodes - // TeamThreadRange over number nodes of node_groups - // To avoid masking threads, 1 thread (team) per node in -node_group (thread has full ownership of a node) - // ThreadVectorRange responsible for the actual solve computation - //const int node_groups = team_size; - const int node_groups = vector_size; - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, -nodes_grouped_by_level, false, node_count, vector_size, 0); #else - UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, -nodes_grouped_by_level, node_count, node_groups); #endif - - Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( -(int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), -tstf); } // end elseif - */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || @@ -1790,10 +1963,7 @@ tstf); } // end elseif Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? + node_count += lvl_nodes; } // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? 
that is, can the @@ -1868,4 +2038,6 @@ tstf); } // end elseif } // namespace Impl } // namespace KokkosSparse +#undef FunctorTypeMacro + #endif diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index df078518ec..87cf72686c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -103,7 +103,8 @@ struct SPTRSV_SOLVE; // Call specific algorithm type - auto sptrsv_handle = handle->get_sptrsv_handle(); + auto sptrsv_handle = handle->get_sptrsv_handle(); + const auto block_enabled = sptrsv_handle->is_block_enabled(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() ? "KokkosSparse_sptrsv[lower]" : "KokkosSparse_sptrsv[upper]"); if (sptrsv_handle->is_lower_tri()) { @@ -120,7 +121,13 @@ struct SPTRSV_SOLVE(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + { + if (block_enabled) { + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + } else { + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + } + } } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -136,7 +143,13 @@ struct SPTRSV_SOLVE(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + { + if (block_enabled) { + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + } else { + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); + } + } } } Kokkos::Profiling::popRegion(); diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 69f6a5dd5e..a4b62a28ba 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -44,6 +44,7 @@ template class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; + using size_type = typename CRS::size_type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; using DEVICE = typename Kokkos::Device; @@ -60,12 +61,12 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: //! Constructor: template - LUPrec(const CRSArg &L, const CRSArg &U) + LUPrec(const CRSArg &L, const CRSArg &U, const size_type block_size = 0) : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numPointRows()), _tmp2("LUPrec::_tmp", L.numPointRows()), _khL(), _khU() { KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), "LUPrec: L.numRows() != U.numRows()"); - _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), true); - _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), false); + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), true, block_size); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), false, block_size); } //! Destructor. 
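Note on the block_enabled dispatch in the spec changes above: the sptrsv handle only knows is_block_enabled() at runtime, while lower_tri_solve/upper_tri_solve take BlockEnabled as a compile-time template parameter (the impl file asserts block_enabled == BlockEnabled), so the branch happens once at the outermost call and the blocked/unblocked Common specialization is chosen at compile time. A minimal self-contained sketch of that pattern, with hypothetical names rather than the library's API:

    #include <cstdio>

    template <bool BlockEnabled>
    void tri_solve() {
      if constexpr (BlockEnabled) {
        std::printf("blocked path: block/vector accessors\n");
      } else {
        std::printf("point path: scalar accessors\n");
      }
    }

    // Mirrors the spec-file pattern: branch once on the runtime flag, then call
    // the templated solver with the matching compile-time value.
    void solve(bool block_enabled) {
      if (block_enabled) {
        tri_solve<true>();
      } else {
        tri_solve<false>();
      }
    }

    int main() {
      solve(true);   // blocked
      solve(false);  // point
      return 0;
    }
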
@@ -74,52 +75,6 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - template ::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - // tmp = trsv(L, x); //Apply L^inv to x - // y = trsv(U, tmp); //Apply U^inv to tmp - - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); - - sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); - sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); - - sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); - sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, _tmp2); - - KokkosBlas::axpby(alpha, _tmp2, beta, Y); - } - - template ::value>::type * = nullptr> - void apply_impl(const Kokkos::View &X, const Kokkos::View &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - // tmp = trsv(L, x); //Apply L^inv to x - // y = trsv(U, tmp); //Apply U^inv to tmp - - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - using Layout = Kokkos::LayoutLeft; -#else - using Layout = Kokkos::LayoutRight; -#endif - - // trsv is implemented for MV so we need to convert our views - using UView2d = typename Kokkos::View >; - using UView2dc = typename Kokkos::View >; - UView2dc X2d(X.data(), X.extent(0), 1); - UView2d Y2d(Y.data(), Y.extent(0), 1), tmp2d(_tmp.data(), _tmp.extent(0), 1), - tmp22d(_tmp2.data(), _tmp2.extent(0), 1); - - KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); - KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); - - KokkosBlas::axpby(alpha, _tmp2, beta, Y); - } - ///// \brief Apply the preconditioner to X, putting the result in Y. 
///// ///// \tparam XViewType Input vector, as a 1-D Kokkos::View @@ -134,7 +89,15 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { virtual void apply(const Kokkos::View &X, const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - apply_impl(X, Y, transM, alpha, beta); + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); + + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); + sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); + + sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, _tmp2); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); } //@} diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 51a39a999b..b6ca3dacfd 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -930,6 +930,7 @@ class SPTRSVHandle { KOKKOS_INLINE_FUNCTION void set_block_size(const size_type block_size_) { this->block_size = block_size_; } + bool is_block_enabled() const { return block_size > 0; } void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 287110f665..1fde1ac5ed 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -740,7 +740,7 @@ struct SpilukTest { gmres_handle->set_verbose(verbose); // Make precond. - KokkosSparse::Experimental::LUPrec myPrec(L, U); + KokkosSparse::Experimental::LUPrec myPrec(L, U, UseBlocks ? block_size : 0); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 663f9d83ef..385367bca2 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -70,6 +70,13 @@ struct SptrsvTest { return A; } + static std::vector> get_6x6_ut_ones_fixture() { + std::vector> A = {{1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, {0.00, 1.00, 0.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 0.00, 1.00}, {0.00, 0.00, 0.00, 1.00, 0.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00, 1.00}, {0.00, 0.00, 0.00, 0.00, 0.00, 1.00}}; + return A; + } + static std::vector> get_5x5_ut_fixture() { const auto KZ = KEEP_ZERO(); std::vector> A = {{5.00, 1.00, 1.00, 0.00, KZ}, @@ -99,6 +106,13 @@ struct SptrsvTest { return A; } + static std::vector> get_6x6_lt_ones_fixture() { + std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00, 0.00}, {1.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 0.00, 0.00, 0.00}, {0.00, 0.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 1.00, 1.00, 0.00}, {0.00, 1.00, 1.00, 1.00, 1.00, 1.00}}; + return A; + } + static bool do_cusparse() { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE return (std::is_same::value && std::is_same::value && @@ -472,6 +486,21 @@ struct SptrsvTest { } } + static void run_test_sptrsv_blocks_impl(const bool is_lower, const size_type block_size) { + auto fixture = is_lower ? 
get_6x6_lt_ones_fixture() : get_6x6_ut_ones_fixture();
+    const auto [triMtx_crs, lhs, rhs] = create_crs_lhs_rhs(fixture);
+    Bsr triMtx(triMtx_crs, block_size);
+    basic_check(triMtx, lhs, rhs, is_lower, block_size);
+  }
+
+  static void run_test_sptrsv_blocks() {
+    for (size_type block_size : {1, 2, 3}) {
+      run_test_sptrsv_blocks_impl(true, block_size);
+      run_test_sptrsv_blocks_impl(false, block_size);
+    }
+  }
+
   static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, const bool is_lower) {
     // Workaround for OpenMP: skip tests if concurrency < nstreams because of
     // not enough resource to partition
@@ -564,6 +593,7 @@ template ;
   TestStruct::run_test_sptrsv();
+  TestStruct::run_test_sptrsv_blocks();
 }

 template
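For reference, a sketch of the aliasing pattern that the blocked Common's vget/lget accessors use: unmanaged LayoutRight views wrapped around the flat BSR values array and the flat lhs vector, so no allocation happens inside the kernels. The element type and memory space below are placeholders, not the patch's actual typedefs.

    #include <Kokkos_Core.hpp>
    #include <cstdio>
    #include <vector>

    // Placeholder types for illustration; the patch builds its aliases from the
    // functor's value/lhs view types, not hard-coded like this.
    using Unmanaged = Kokkos::MemoryTraits<Kokkos::Unmanaged>;
    using CBlock = Kokkos::View<const double **, Kokkos::LayoutRight, Kokkos::HostSpace, Unmanaged>;
    using Vector = Kokkos::View<double *, Kokkos::LayoutRight, Kokkos::HostSpace, Unmanaged>;

    // vget-style alias: block `b` of the flat BSR values array (block_items = block_size^2).
    CBlock block_of(const double *values, int b, int block_size) {
      return CBlock(values + b * block_size * block_size, block_size, block_size);
    }

    // lget-style alias: block-row `r` of the flat lhs vector.
    Vector row_of(double *lhs, int r, int block_size) {
      return Vector(lhs + r * block_size, block_size);
    }

    int main(int argc, char **argv) {
      Kokkos::ScopeGuard guard(argc, argv);
      const int block_size = 2;
      std::vector<double> values = {1, 2, 3, 4, 5, 6, 7, 8};  // two 2x2 blocks, row-major
      std::vector<double> lhs    = {10, 20, 30, 40};          // two block-rows

      CBlock A1 = block_of(values.data(), 1, block_size);     // second block: [[5,6],[7,8]]
      Vector x0 = row_of(lhs.data(), 0, block_size);          // first block-row: [10,20]
      std::printf("A1(0,1) = %g, x0(1) = %g\n", A1(0, 1), x0(1));  // prints 6 and 20
      return 0;
    }
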
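Finally, a standalone numerical sketch of the block-row update the new functors implement: ReduceFunctorBlock folds -A_rc * x_c into the lhs block for each already-solved column block, and add_and_divide then applies the inverse of the diagonal block, which the patch realizes as an in-place LU followed by two triangular solves. The snippet below reproduces that arithmetic in plain C++ with made-up 2x2 blocks; the LU and substitution helpers stand in for the KokkosBatched LU/Trsv calls and are not the library's API.

    #include <cstdio>
    #include <vector>

    // In-place LU factorization (Doolittle, no pivoting) of an n x n row-major block.
    void lu_inplace(std::vector<double> &A, int n) {
      for (int k = 0; k < n; ++k)
        for (int i = k + 1; i < n; ++i) {
          A[i * n + k] /= A[k * n + k];
          for (int j = k + 1; j < n; ++j) A[i * n + j] -= A[i * n + k] * A[k * n + j];
        }
    }

    // Solve (L*U) x = b in place: unit-lower forward solve, then upper back solve.
    void lu_solve(const std::vector<double> &LU, std::vector<double> &b, int n) {
      for (int i = 0; i < n; ++i)
        for (int j = 0; j < i; ++j) b[i] -= LU[i * n + j] * b[j];
      for (int i = n - 1; i >= 0; --i) {
        for (int j = i + 1; j < n; ++j) b[i] -= LU[i * n + j] * b[j];
        b[i] /= LU[i * n + i];
      }
    }

    int main() {
      const int bs = 2;                          // block size
      std::vector<double> A_rc = {1, 0, 0, 1};   // off-diagonal block (its column is already solved)
      std::vector<double> A_rr = {4, 1, 2, 3};   // diagonal block of this block-row
      std::vector<double> x_c  = {1, 2};         // solution of the already-finished block-row
      std::vector<double> x_r  = {7, 10};        // starts as the rhs block b_r

      // Accumulate: x_r -= A_rc * x_c (what the reduce functor folds into lhs, one lane per b).
      for (int i = 0; i < bs; ++i)
        for (int j = 0; j < bs; ++j) x_r[i] -= A_rc[i * bs + j] * x_c[j];

      // Divide: x_r = A_rr^{-1} * x_r, via LU of the diagonal block plus two triangular solves.
      lu_inplace(A_rr, bs);
      lu_solve(A_rr, x_r, bs);

      std::printf("x_r = [%g, %g]\n", x_r[0], x_r[1]);  // prints x_r = [1, 2]
      return 0;
    }
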