From 815eb0951d9857a84012f65d68729b9e0a28059a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 3 Apr 2023 21:39:01 -0600 Subject: [PATCH 001/231] BLAS2 syr() implementation and tests. About 85% done. --- blas/CMakeLists.txt | 7 + .../syr/KokkosBlas2_syr_eti_spec_inst.cpp.in | 25 + .../KokkosBlas2_syr_eti_spec_avail.hpp.in | 25 + .../KokkosBlas2_syr_eti_spec_decl.hpp.in | 25 + blas/impl/KokkosBlas2_syr_impl.hpp | 340 ++++ blas/impl/KokkosBlas2_syr_spec.hpp | 208 +++ blas/src/KokkosBlas2_syr.hpp | 133 ++ blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 127 ++ blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp | 35 + .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 356 ++++ .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 386 ++++ .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 401 +++++ blas/tpls/KokkosBlas_Host_tpl.cpp | 177 ++ blas/tpls/KokkosBlas_Host_tpl.hpp | 27 + blas/unit_test/Test_Blas.hpp | 1 + blas/unit_test/Test_Blas2_syr.hpp | 1558 +++++++++++++++++ 16 files changed, 3831 insertions(+) create mode 100644 blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in create mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in create mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in create mode 100644 blas/impl/KokkosBlas2_syr_impl.hpp create mode 100644 blas/impl/KokkosBlas2_syr_spec.hpp create mode 100644 blas/src/KokkosBlas2_syr.hpp create mode 100644 blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp create mode 100644 blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp create mode 100644 blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp create mode 100644 blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp create mode 100644 blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp create mode 100644 blas/unit_test/Test_Blas2_syr.hpp diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 04f883c21a..d6ce98dae9 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -297,6 
+297,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_ger ger TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_syr syr + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..00cbe2f171 --- /dev/null +++ b/blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_syr_spec.hpp" + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR_ETI_INST_BLOCK@ +} //IMPL +} //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..d789bcd6ef --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR_ETI_AVAIL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..4b66faf5b2 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp new file mode 100644 index 0000000000..db9f8c24a5 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -0,0 +1,340 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_IMPL_HPP_ +#define KOKKOSBLAS2_SYR_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for a single-level parallel_for version of nontranspose SYR. +// The functor parallelizes over rows of the input matrix A. +template +struct SingleLevelSYR { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using A_value_type = typename AViewType::non_const_value_type; + + SingleLevelSYR( const bool justTranspose + , const bool justUp + , const AlphaCoeffType & alpha + , const XViewType & x + , const AViewType & A + ) + : justTranspose_(justTranspose) + , justUp_ (justUp) + , alpha_ (alpha) + , x_ (x) + , A_ (A) + { + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer."); + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { // AquiEEP + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType N ( A_.extent(1) ); + const A_value_type x_fixed( x_(i) ); + + if (justTranspose_) { + for (IndexType j = 0; j < N; ++j) { + if (( 
(justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += A_value_type( alpha_ * x_fixed * x_(j) ); + } + } + } + else { + for (IndexType j = 0; j < N; ++j) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( x_(j) ) ); + } + } + } + } + } + +private: + bool justTranspose_; + bool justUp_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + AViewType A_; +}; + +// Single-level parallel version of SYR. +template +void singleLevelSyr( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL singleLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer"); + + using KAT = Kokkos::Details::ArithTraits; + + if (x.extent(0) == 0) { + // no entries to update + } + else if (alpha == KAT::zero()) { + // no entries to update + } + else { + using execution_space = typename AViewType::execution_space; + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + SingleLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') + , (uplo[0] == 'U') || (uplo[0] == 'u') + , alpha + , x + , A + ); + Kokkos::parallel_for("KokkosBlas::syr[SingleLevel]", rangePolicy, functor); + } +} + +struct TwoLevelSYR_LayoutLeftTag {}; +struct TwoLevelSYR_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor 
for a two-level parallel_for version of SYR, designed for performance on GPU. +// Kernel depends on the layout of A. +template +struct TwoLevelSYR { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using A_value_type = typename AViewType::non_const_value_type; + + using execution_space = typename AViewType::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TwoLevelSYR( const bool justTranspose + , const bool justUp + , const AlphaCoeffType & alpha + , const XViewType & x + , const AViewType & A + ) + : justTranspose_(justTranspose) + , justUp_ (justUp) + , alpha_ (alpha) + , x_ (x) + , A_ (A) + { + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer."); + } + +public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag // AquiEEP + , const member_type & team + ) const { + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType M ( A_.extent(0) ); + const IndexType j ( team.league_rank() ); + if (justTranspose_) { + const A_value_type x_fixed( x_(j) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += A_value_type( alpha_ * x_(i) * x_fixed ); + } + }); + } + else { + const A_value_type x_fixed( KAT::conj( x_(j) ) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { +
A_(i,j) += A_value_type( alpha_ * x_(i) * x_fixed ); + } + }); + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag // AquiEEP + , const member_type & team + ) const { + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType N ( A_.extent(1) ); + const IndexType i ( team.league_rank() ); + const A_value_type x_fixed( x_(i) ); + if (justTranspose_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += A_value_type( alpha_ * x_fixed * x_(j) ); + } + }); + } + else { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( x_(j) ) ); + } + }); + } + } + team.team_barrier(); + } + +private: + bool justTranspose_; + bool justUp_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + AViewType A_; +}; + +// Two-level parallel version of SYR. 
+template +void twoLevelSyr( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL twoLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer"); + + using KAT = Kokkos::Details::ArithTraits; + + if (x.extent(0) == 0) { + // no entries to update + return; + } + else if (alpha == KAT::zero()) { + // no entries to update + return; + } + + using execution_space = typename AViewType::execution_space; + constexpr bool isLayoutLeft = std::is_same::value; + using layout_tag = typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } + else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TwoLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') + , (uplo[0] == 'U') || (uplo[0] == 'u') + , alpha + , x + , A + ); + Kokkos::parallel_for("KokkosBlas::syr[twoLevel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalSyr: use 1 level (Range) or 2 level (Team) implementation, +// depending on whether execution space is CPU or GPU. +// The 'enable_if' makes sure unused kernels are not instantiated. 
+ +template < class XViewType + , class AViewType + , class IndexType + , typename std::enable_if() >::type* = nullptr + > +void generalSyrImpl( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalSyrImpl(CPU), AViewType = %s\n", typeid(AViewType).name() ); + singleLevelSyr(space, trans, uplo, alpha, x, A); +} + +template < class XViewType + , class AViewType + , class IndexType + , typename std::enable_if()>::type* = nullptr + > +void generalSyrImpl( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalSyrImpl(GPU), AViewType = %s\n", typeid(AViewType).name() ); + twoLevelSyr(space, trans, uplo, alpha, x, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp new file mode 100644 index 0000000000..027c31d692 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -0,0 +1,208 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_SPEC_HPP_ +#define KOKKOSBLAS2_SYR_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability KokkosBlas::Impl::SYR. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_eti_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// syr +// + +// Implementation of KokkosBlas::syr. 
+template < class XViewType + , class AViewType + , bool tpl_spec_avail = syr_tpl_spec_avail::value + , bool eti_spec_avail = syr_eti_spec_avail::value + > +struct SYR { + static void syr( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Syr::syr()\n" ); + + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + if ((trans[0] == 'T') || + (trans[0] == 't') || + (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } + else { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::syr(): invalid trans[0] = " << trans[0]; + throw std::runtime_error(oss.str()); + } + + if ((uplo[0] == 'U') || + (uplo[0] == 'u') || + (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } + else { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0]; + throw std::runtime_error(oss.str()); + } + + if (A.extent(0) != x.extent(0)) { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::syr(): A.extent(0) = " << A.extent(0) + << ", but x.extent(0) = " << x.extent(0); + throw std::runtime_error(oss.str()); + } + + if (A.extent(1) != x.extent(0)) { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::syr(): A.extent(1) = " << A.extent(1) + << ", but x.extent(0) = " << x.extent(0); + throw std::runtime_error(oss.str()); + } + + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ?
"KokkosBlas::syr[ETI]" : "KokkosBlas::syr[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + // Prefer int as the index type, but use a larger type if needed. + if (( numRows < static_cast(INT_MAX) ) && + ( numCols < static_cast(INT_MAX) )) { + generalSyrImpl( space + , trans + , uplo + , alpha + , x + , A + ); + } + else { + generalSyrImpl( space + , trans + , uplo + , alpha + , x + , A + ); + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::SYR. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , false \ + , true \ + >; + +#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , false \ + , true \ + >; + +#include +#include + +#endif // KOKKOSBLAS2_SYR_SPEC_HPP_ diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp new file mode 100644 index 0000000000..68d30deccd --- /dev/null +++ b/blas/src/KokkosBlas2_syr.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v.
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_HPP_ +#define KOKKOSBLAS2_SYR_HPP_ + +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update of the upper or lower triangle (selected by uplo) of matrix A: A = A + alpha * x * x^{T,H}. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr( const typename AViewType::execution_space & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::syr(), AViewType = %s\n", typeid(AViewType).name() ); + + static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); + static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." ); + + static_assert( static_cast(AViewType::rank) == 2, "AViewType must have rank 2." ); + static_assert( static_cast(XViewType::rank) == 1, "XViewType must have rank 1." ); + + // Check compatibility of dimensions at run time.
+ if (( A.extent(0) != x.extent(0) ) || + ( A.extent(1) != x.extent(0) )) { + std::ostringstream os; + os << "KokkosBlas::syr: Dimensions of A, x: " + << "A is " << A.extent(0) << " by " << A.extent(1) + << ", x has size " << x.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::SYR instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View< typename XViewType::const_value_type* + , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout + , typename XViewType::device_type + , Kokkos::MemoryTraits + > XVT; + + typedef Kokkos::View< typename AViewType::non_const_value_type** + , ALayout + , typename AViewType::device_type + , Kokkos::MemoryTraits + > AVT; + + if (( A.extent(0) == 0 ) || + ( A.extent(1) == 0 )) { + // For degenerate cases, use fallback implementation to avoid potential + // (unlikely?) circular dependence issues by including other KokkosBlas + // headers. + const bool eti_spec_avail = KokkosBlas::Impl::syr_eti_spec_avail::value; + Impl::SYR::syr( space + , trans + , uplo + , alpha + , x + , A + ); + } + else { + Impl::SYR::syr( space + , trans + , uplo + , alpha + , x + , A + ); + } +} + +/// \brief Rank-1 update of the upper or lower triangle (selected by uplo) of matrix A: A = A + alpha * x * x^{T,H}. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account.
+/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr( const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) { + const typename AViewType::execution_space space = typename AViewType::execution_space(); + syr( space + , trans + , uplo + , alpha + , x + , A + ); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp new file mode 100644 index 0000000000..c0317cdbc8 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -0,0 +1,127 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTA, MEMSPACE) \ + template \ + struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTA, MEMSPACE) \ + template \ + struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , 
Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template <> \ + struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device< Kokkos::Experimental::HIP \ + , Kokkos::Experimental::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device< Kokkos::Experimental::HIP \ + , Kokkos::Experimental::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight)
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp new file mode 100644 index 0000000000..1480bb1655 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp new file mode 100644 index 0000000000..7438af7035 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -0,0 +1,356 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + if (( trans[0] == 'T' ) || \ + ( trans[0] == 't' ) || \ + ( trans[0] == 'H' ) || \ + ( trans[0] == 'h' )) { \ + } \ + else { \ + throw std::runtime_error("Error: invalid 'trans' for HostBlas::syr()"); \ + } \ + if (( uplo[0] == 'U' ) || \ + ( uplo[0] == 'u' ) || \ + ( uplo[0] == 'L' ) || \ + ( uplo[0] == 'l' )) { \ + } \ + else { \ + throw std::runtime_error("Error: invalid 'uplo' for HostBlas::syr()"); \ + } + +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const double* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /* space */ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) 
{ \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + if (A_is_ll) { \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + else { \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const float* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /* space */ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + if (A_is_ll) { \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + else { \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUTX \ + , Kokkos::Device \ + , 
Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /* space */ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::syru( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + HostBlas>::syru( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: blasZsyrc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ 
+ > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /* space */ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::syru( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + HostBlas>::syru( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: blasCsyrc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp new file mode 100644 index 0000000000..f5bdc9d2c7 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -0,0 +1,386 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + if (( trans[0] == 'T' ) || \ + ( trans[0] == 't' ) || \ + ( trans[0] == 'H' ) || \ + ( trans[0] == 'h' )) { \ + } \ + else { \ + throw std::runtime_error("Error: invalid 'trans' for cudaBlas::syr()"); \ + } \ + if (( uplo[0] == 'U' ) || \ + ( uplo[0] == 'u' ) || \ + ( uplo[0] == 'L' ) || \ + ( uplo[0] == 'l' )) { \ + } \ + else { \ + throw std::runtime_error("Error: invalid 'uplo' for cudaBlas::syr()"); \ + } + +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const double* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-cublas\n" ); \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const float* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (A_is_ll) { \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + 
} \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: cublasZsyrc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton & s = 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: cublasCsyrc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) 
+KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp new file mode 100644 index 0000000000..fe29211161 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + if (( trans[0] == 'T' ) || \ + ( trans[0] == 't' ) || \ + ( trans[0] == 'H' ) || \ + ( trans[0] == 'h' )) { \ + } \ + else { \ + throw std::runtime_error( "Error: invalid 'trans' for rocBlas::syr()"); \ + } \ + if (( uplo[0] == 'U' ) || \ + ( uplo[0] == 'u' ) || \ + ( uplo[0] == 'L' ) || \ + ( uplo[0] == 'l' )) { \ + } \ + else { \ + throw std::runtime_error( "Error: invalid 'uplo' for rocBlas::syr()"); \ + } + +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< Kokkos::View< const double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + 
KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) 
); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyru( s.handle \ + , uplo \ 
+ , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: rocblasZsyrc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-rocblas\n" ); \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyru( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyrc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: rocblasCgec() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, 
Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 37733f609e..44fc134434 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -249,6 +249,56 @@ void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, const std::complex*, int*, std::complex*, int*); +/// +/// Syr +/// +void F77_BLAS_MANGLE(ssyr, SSYR)( const char* + , int* + , const float* + , const float* + , int* + , float* + , int* + ); +void F77_BLAS_MANGLE(dsyr, DSYR)( const char* + , int* + , const double* + , const double* + , int* + , double* + , int* + ); +void F77_BLAS_MANGLE(csyru, CSYRU)( const char* + , int* + , const std::complex* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(csyrc, CSYRC)( const char* + , int* + , const std::complex* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(zsyru, ZSYRU)( int* + , const std::complex* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(zsyrc, ZSYRC)( int* + , const std::complex* + , const std::complex* + , 
int* + , std::complex* + , int* + ); + /// /// Trsv /// @@ -470,6 +520,13 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) #define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) +#define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) +#define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) +#define F77_FUNC_CSYRU F77_BLAS_MANGLE(csyru, CSYRU) +#define F77_FUNC_CSYRC F77_BLAS_MANGLE(csyrc, CSYRC) +#define F77_FUNC_ZSYRU F77_BLAS_MANGLE(zsyru, ZSYRU) +#define F77_FUNC_ZSYRC F77_BLAS_MANGLE(zsyrc, ZSYRC) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -577,6 +634,24 @@ void HostBlas::ger(int m, int n, const float alpha, const float* x, F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> +void HostBlas::syr( const char uplo + , int n + , const float alpha + , const float* x + , int incx + , float* a + , int lda + ) { + F77_FUNC_SSYR( &uplo + , &n + , &alpha + , x + , &incx + , a + , &lda + ); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -696,6 +771,24 @@ void HostBlas::ger(int m, int n, const double alpha, const double* x, F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> +void HostBlas::syr( const char uplo + , int n + , const double alpha + , const double* x + , int incx + , double* a + , int lda + ) { + F77_FUNC_DSYR( &uplo + , &n + , &alpha + , x + , &incx + , a + , &lda + ); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -842,6 +935,48 @@ void HostBlas >::gerc( &lda); } template <> +void HostBlas >::syru( const char uplo + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { +#if 0 // AquiEEP + std::string 
trans("T"); + F77_FUNC_CGERU( &trans[0] + , &n + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); +#endif +} +template <> +void HostBlas >::syrc( const char uplo + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { + F77_FUNC_CHER( &uplo + , &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, int lda, @@ -1012,6 +1147,48 @@ void HostBlas >::gerc( (std::complex*)a, &lda); } template <> +void HostBlas >::syru( const char uplo + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { +#if 0 // AquiEEP + std::string trans("T"); + F77_FUNC_ZGERU( &trans[0] + , &n + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); +#endif +} +template <> +void HostBlas >::syrc( const char uplo + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { + F77_FUNC_ZHER( &uplo + , &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index cd53537ea6..5772ada279 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -73,6 +73,33 @@ struct HostBlas { static void gerc(int m, int n, const T alpha, const T *x, int incx, const T *y, int incy, T *a, int lda); + static void syr( const char uplo + , int n + , const T alpha + , const T* x + , int incx + , T* a + , int lda + ); + + static void syru( const char uplo + , int n + , const T alpha + , const T* x + 
, int incx + , T* a + , int lda + ); + + static void syrc( const char uplo + , int n + , const T alpha + , const T* x + , int incx + , T* a + , int lda + ); + static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, /* */ T *b, int ldb); diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index ff955d13a8..077c7eb870 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -62,6 +62,7 @@ // Blas 2 #include "Test_Blas2_gemv.hpp" #include "Test_Blas2_ger.hpp" +#include "Test_Blas2_syr.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp new file mode 100644 index 0000000000..711b757151 --- /dev/null +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -0,0 +1,1558 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include + +namespace Test { + +constexpr double piVal = 3.14159265358979323846; + +template +class SyrTester +{ +public: + SyrTester(); + + ~SyrTester(); + + void test( const int N + , const int nonConstConstCombinations + , const bool useAnalyticalResults = false + , const bool useHermitianOption = false + , const bool useUpOption = false + ); + +private: + typedef Kokkos::View _ViewTypeX; + typedef Kokkos::View _ViewTypeA; + + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View _ViewTypeExpected; + + typedef Kokkos::ArithTraits _KAT_A; + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables( ScalarA & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + , _ViewTypeX & x + , _ViewTypeA & A + , bool & expectedResultIsKnown + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ); + + template + typename std::enable_if< 
std::is_same>::value || std::is_same>::value + , void + >::type + compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkSyrAndCompareAgainstExpected( const ScalarA & alpha + , TX & x + , _ViewTypeA & A + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + , const std::string & situation + ); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _epsAbs; + const _AuxType _epsRel; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyrShouldThrowException; +}; + +template +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::SyrTester() + : _A_is_complex ( std::is_same>::value || std::is_same>::value ) + , _A_is_lr ( std::is_same< tLayoutA, Kokkos::LayoutRight >::value ) + , _A_is_ll ( std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) + , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) +#else 
+ , _vanillaUsesDifferentOrderOfOps( false ) +#endif + , _epsAbs (std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9) + , _epsRel (std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6) + , _M (-1) + , _N (-1) + , _useAnalyticalResults (false) + , _useHermitianOption (false) + , _useUpOption (false) + , _kkSyrShouldThrowException (false) +{ +} + +template +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::~SyrTester() +{ + // Nothing to do +} + +template +void SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::test( const int N + , const int nonConstConstCombinations + , const bool useAnalyticalResults + , const bool useHermitianOption + , const bool useUpOption + ) +{ + std::cout << "Entering SyrTester::test()... - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps + << ", _epsAbs = " << _epsAbs + << ", _epsRel = " << _epsRel + << std::endl; + + // ******************************************************************** + // Step 1 of 7: declare main types and variables + // ******************************************************************** + _M = N; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + _useUpOption = useUpOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkSyrShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + if ((_testIsGpu == false) && + (_A_is_ll == false)) { + _kkSyrShouldThrowException = true; + } + else if ((_testIsGpu == true ) && + (_A_is_ll == false)) { + _kkSyrShouldThrowException = true; + } + } +#endif + + bool test_x (false); + bool test_cx(false); + if (nonConstConstCombinations == 0) { + test_x = true; + 
} + else if (nonConstConstCombinations == 1) { + test_cx = true; + } + else { + test_x = true; + test_cx = true; + } + + _ViewTypeX x("X", _M); + _ViewTypeA A("A", _M, _N); + + typename _ViewTypeX::const_type c_x = x; + + _HostViewTypeX h_x = Kokkos::create_mirror_view(x); + _HostViewTypeA h_A = Kokkos::create_mirror_view(A); + + _ViewTypeExpected h_expected("expected A += alpha * x * x^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + ScalarA alpha(0.); + + // ******************************************************************** + // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A + // ******************************************************************** + this->populateVariables( alpha + , h_x + , h_A + , h_expected + , x + , A + , expectedResultIsKnown + ); + + // ******************************************************************** + // Step 3 of 7: populate h_vanilla + // ******************************************************************** + _ViewTypeExpected h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name() ); + this->populateVanillaValues( alpha + , h_x + , h_A + , h_vanilla + ); + + // ******************************************************************** + // Step 4 of 7: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaExpected( alpha + , h_vanilla + , h_expected + ); + } + else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected, h_vanilla); + } + + // 
******************************************************************** + // Step 5 of 7: test with 'non const x' + // ******************************************************************** + _ViewTypeA org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A, A); + + if (test_x) { + this->callKkSyrAndCompareAgainstExpected( alpha + , x + , A + , h_A + , h_expected + , "non const x" + ); + } + + // ******************************************************************** + // Step 6 of 7: test with const x + // ******************************************************************** + if (test_cx) { + Kokkos::deep_copy(A, org_A); + + this->callKkSyrAndCompareAgainstExpected( alpha + , c_x + , A + , h_A + , h_expected + , "const x" + ); + } + + // ******************************************************************** + // Step 7 of 7: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW( KokkosBlas::syr(".", "U", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( KokkosBlas::syr("", "U", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for mode ''"; + EXPECT_ANY_THROW( KokkosBlas::syr("T", ".", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW( KokkosBlas::syr("T", "", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for uplo ''"; + + std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; +} + +template +void SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::populateVariables( ScalarA & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + , _ViewTypeX & x + , _ViewTypeA & A + , bool & expectedResultIsKnown + ) +{ + expectedResultIsKnown = false; + + if 
(_useAnalyticalResults) { + this->populateAnalyticalValues( alpha + , h_x + , h_A + , h_expected + ); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } + else if (_N == 1) { + alpha = 3; + + h_x[0] = 2; + + h_A(0,0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + h_expected(0,0) = 19; + expectedResultIsKnown = true; + } + else if (_N == 2) { + alpha = 3; + + h_x[0] = -2; + h_x[1] = 9; + + h_A(0,0) = 17; + h_A(0,1) = -43; + h_A(1,0) = -43; + h_A(1,1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + if (_useUpOption) { + h_expected(0,0) = 29; + h_expected(0,1) = -97; + h_expected(1,0) = -43; + h_expected(1,1) = 344; + } + else { + h_expected(0,0) = 29; + h_expected(0,1) = -43; + h_expected(1,0) = -97; + h_expected(1,1) = 344; + } + expectedResultIsKnown = true; + } + else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_A, A); + } +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + _AuxType auxImJ(0.); + + alpha.real() = 1.; + alpha.imag() = -1.; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + h_x[i].real() = sin(auxI); + h_x[i].imag() = cos(auxI); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; 
++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_A(i,j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i,j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + } + } + } + } + else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_A(i,j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i,j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + } + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); + h_expected(i,j).imag() = 2. 
* (cos(auxIpJ) - sin(auxIpJ)); + } + } + } + } + else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_expected(i,j).real() = 2. * cos(auxI) * cos(auxJ); + h_expected(i,j).imag() = -2. * sin(auxImJ); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + + alpha = 3; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + h_x[i] = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + h_A(i,j) = 3 * cos(auxI) * sin(auxJ); + } + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_expected(i,j) = 3 * sin(auxIpJ); + } + } + } +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + 
>::populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + alpha * _KAT_A::conj( h_x(j) ) * h_x(i); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(j) * h_x(i); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } + } + else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * _KAT_A::conj( h_x(j) ); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_x(j); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + 
alpha * h_x(j) * h_x(i); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_x(j); + } + else { + h_vanilla(i,j) = h_A(i,j); + } + } + } + } +} + +template +template +T SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::shrinkAngleToZeroTwoPiRange(const T input) +{ + T output(input); +#if 0 + T twoPi( 2. * piVal ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) { + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + if (_useAnalyticalResults) { + int numErrorsRealAbs (0); + int numErrorsRealRel (0); + int numErrorsImagAbs (0); + int numErrorsImagRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRealRel (0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel (0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()); + errorHappened = false; + if (h_expected(i,j).real() == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsRealAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + + diff = _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()); + errorHappened = false; + if (h_expected(i,j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsImagAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + 
<< ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_vanilla(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_vanilla(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_vanilla(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } + else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if ( h_expected(i,j).real() != h_vanilla(i,j).real() ) { + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + << std::endl; + } + numErrorsReal++; + } + + if ( h_expected(i,j).imag() != h_vanilla(i,j).imag() ) { + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + << std::endl; + } + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if< 
!std::is_same>::value && !std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + if (_useAnalyticalResults) { + int numErrorsAbs (0); + int numErrorsRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRel (0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)); + errorHappened = false; + if (h_expected(i,j) == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_vanilla(i,j) = " << h_vanilla(i,j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << 
", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_vanilla(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } + else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if ( h_expected(i,j) != h_vanilla(i,j) ) { + if (numErrors == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_vanilla(i,j) = " << h_vanilla(i,j) + << std::endl; + } + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + int numErrorsRealAbs (0); + int numErrorsRealRel (0); + int numErrorsImagAbs (0); + int numErrorsImagRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRealRel (0.); + int iForMaxErrorRealRel(0); + int 
jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel (0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()); + errorHappened = false; + if (h_expected(i,j).real() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_A(i,j).real() = " << h_A(i,j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + + diff = _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()); + errorHappened = false; + if (h_expected(i,j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_A(i,j).imag() = " << h_A(i,j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed + << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_expected(11, 2119) = (" << h_expected(11,2119).real() << ", " << h_expected(11,2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" + << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_expected(710, 1065) = (" << h_expected(710,1065).real() << ", " << h_expected(710,1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" + << std::endl; + } + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + int numErrorsAbs (0); + int numErrorsRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRel (0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j) - h_A(i,j)); + errorHappened = false; + if (h_expected(i,j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_A(i,j) = " << h_A(i,j) + << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed + << std::endl; + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::callKkSyrAndCompareAgainstExpected( const ScalarA & alpha + , TX & x + , _ViewTypeA & A + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + , const std::string & situation + ) +{ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException ); + std::string mode = _useHermitianOption ? "H" : "T"; + std::string uplo = _useUpOption ? 
"U" : "L"; + bool gotStdException (false); + bool gotUnknownException(false); + try { + KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); + } + catch( const std::exception& e ) { + std::cout << "In Test_Blas2_syr, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; + gotStdException = true; + } + catch( ... ) { + std::cout << "In Test_Blas2_syr, '" << situation << "': caught unknown exception" << std::endl; + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) << "Failed test, '" << situation << "': kk syr() should" + << (_kkSyrShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if (( gotStdException == false ) && + ( gotUnknownException == false )) { + Kokkos::deep_copy(h_A, A); + + this->compareKokkosExpected( alpha + , h_A + , h_expected + ); + } +} + +} // namespace Test + +template +int test_syr( const std::string & caseName ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s ...\n", caseName.c_str() ); + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTLEFT ...\n", caseName.c_str() ); + + if (true) { + Test::SyrTester tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + //tester.test(1024, 0 , true, false, false); + //tester.test(1024, 0 , true, false, true); + //tester.test(1024, 0 , true, true, false); + //tester.test(1024, 0 , true, true, true); + tester.test(50, 4 ); + tester.test(1024, 0); + 
tester.test(2131, 0); + //tester.test(2131, 0 , true, false, true); + //tester.test(2131, 0 , true, true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTRIGHT ...\n", caseName.c_str() ); + + if (true) { + Test::SyrTester tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + //tester.test(1024, 0, true, false, false); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, false); + //tester.test(1024, 0, true, true, true); + tester.test(50, 4); + tester.test(1024, 0); + tester.test(2131, 0); + //tester.test(2131, 0, true, false, true); + //tester.test(2131, 0, true, true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str() ); + + if (true) { + Test::SyrTester tester; + tester.test(0, 0); + tester.test(13, 0); + tester.test(1024, 0); + //tester.test(1024, 0, true, false, false); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, false); + //tester.test(1024, 0, true, 
true, true); + tester.test(50, 4); + tester.test(1024, 0); + tester.test(2131, 0); + //tester.test(2131, 0, true, false, true); + //tester.test(2131, 0, true, true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for MIXED LAYOUTS ...\n", caseName.c_str() ); + + if (true) { + Test::SyrTester tester; + tester.test(1024, 0); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, true); + } + + if (true) { + Test::SyrTester tester; + tester.test(1024, 0); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for MIXED LAYOUTS\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); + test_syr( "test case syr_float" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if 1 + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); + test_syr, Kokkos::complex, TestExecSpace>( "test case syr_complex_float" ); + 
Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); + test_syr( "test case syr_double" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); + test_syr, Kokkos::complex, TestExecSpace>( "test case syr_complex_double" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); + test_syr( "test case syr_int" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, syr_double_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double_int"); + test_syr( "test case syr_mixed_types" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#endif // if 1 From a3dda18bd6e239f86225c8d97a5d52562f45d139 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 3 Apr 2023 23:01:50 -0600 Subject: [PATCH 002/231] Compiled and tested in weaver. All the tests passing in the Mac are also passing in weaver. Still have to enable some tests. 
--- blas/impl/KokkosBlas2_syr_impl.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index db9f8c24a5..7f0ad5ae6c 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -55,7 +55,7 @@ struct SingleLevelSYR { } KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { // AquiEEP - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -110,7 +110,7 @@ void singleLevelSyr( const typename AViewType::execution_space & space static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (x.extent(0) == 0) { // no entries to update @@ -173,7 +173,7 @@ struct TwoLevelSYR { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag // AquiEEP , const member_type & team ) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -206,7 +206,7 @@ struct TwoLevelSYR { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag // AquiEEP , const member_type & team ) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -262,7 +262,7 @@ void twoLevelSyr( const typename AViewType::execution_space & space static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (x.extent(0) == 0) { // no entries to update From 6b5c1c8db42ad22c1ef3ef6e176b685f493b0b0d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 04:01:24 -0600 Subject: [PATCH 003/231] Changes following feedbacks of Luc and Kim for the BLAS2 ger() pull request --- blas/impl/KokkosBlas2_syr_impl.hpp | 128 +++++++----------- 
blas/impl/KokkosBlas2_syr_spec.hpp | 83 +++--------- blas/src/KokkosBlas2_syr.hpp | 89 +++++++----- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 56 ++++---- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 50 +++---- blas/unit_test/Test_Blas2_syr.hpp | 7 +- 6 files changed, 180 insertions(+), 233 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 7f0ad5ae6c..9a6d0428a9 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -27,11 +27,11 @@ namespace Impl { // Functor for a single-level parallel_for version of nontranspose SYR. // The functor parallelizes over rows of the input matrix A. -template +template struct SingleLevelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; - using A_value_type = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; SingleLevelSYR( const bool justTranspose , const bool justUp @@ -45,30 +45,22 @@ struct SingleLevelSYR { , x_ (x) , A_ (A) { - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - static_assert(std::is_integral::value, "IndexType must be an integer."); + // Nothing to do } KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { // AquiEEP - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType N ( A_.extent(1) ); - const A_value_type x_fixed( x_(i) ); + const IndexType N ( A_.extent(1) ); + const XComponentType x_fixed( x_(i) ); if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { if (( (justUp_ == true ) && 
(i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_fixed * x_(j) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); } } } @@ -76,7 +68,7 @@ struct SingleLevelSYR { for (IndexType j = 0; j < N; ++j) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( x_(j) ) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); } } } @@ -92,9 +84,9 @@ struct SingleLevelSYR { }; // Single-level parallel version of SYR. -template -void singleLevelSyr( const typename AViewType::execution_space & space +void singleLevelSyr( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha @@ -102,25 +94,19 @@ void singleLevelSyr( const typename AViewType::execution_space & space , const AViewType & A ) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL singleLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::ArithTraits; + using AlphaCoeffType = typename AViewType::non_const_value_type; if (x.extent(0) == 0) { // no entries to update } - else if (alpha == KAT::zero()) { + else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - using execution_space = typename AViewType::execution_space; - Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); SingleLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') , (uplo[0] == 'U') || (uplo[0] == 'u') , alpha @@ -138,14 
+124,14 @@ struct TwoLevelSYR_LayoutRightTag {}; // Functor for a two-level parallel_reduce version of SYR, designed for performance on GPU. // Kernel depends on the layout of A. -template +template struct TwoLevelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; - using A_value_type = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelSYR( const bool justTranspose , const bool justUp @@ -159,13 +145,7 @@ struct TwoLevelSYR { , x_ (x) , A_ (A) { - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - static_assert(std::is_integral::value, "IndexType must be an integer."); + // Nothing to do } public: @@ -173,29 +153,27 @@ struct TwoLevelSYR { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag // AquiEEP , const member_type & team ) const { - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { const IndexType M ( A_.extent(0) ); const IndexType j ( team.league_rank() ); if (justTranspose_) { - const A_value_type x_fixed( x_(j) ); + const XComponentType x_fixed( x_(j) ); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_(i) * x_fixed ); + A_(i,j) 
+= AComponentType( alpha_ * x_(i) * x_fixed ); } }); } else { - const A_value_type x_fixed( KAT::conj( x_(j) ) ); + const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_(i) * x_fixed ); + A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); } }); } @@ -206,20 +184,18 @@ struct TwoLevelSYR { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag // AquiEEP , const member_type & team ) const { - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType N ( A_.extent(1) ); - const IndexType i ( team.league_rank() ); - const A_value_type x_fixed( x_(i) ); + const IndexType N ( A_.extent(1) ); + const IndexType i ( team.league_rank() ); + const XComponentType x_fixed( x_(i) ); if (justTranspose_) { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_fixed * x_(j) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); } }); } @@ -227,7 +203,7 @@ struct TwoLevelSYR { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( x_(j) ) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); } }); } @@ -244,9 +220,9 @@ struct TwoLevelSYR { }; // Two-level parallel version of SYR. 
-template -void twoLevelSyr( const typename AViewType::execution_space & space +void twoLevelSyr( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha @@ -254,29 +230,23 @@ void twoLevelSyr( const typename AViewType::execution_space & space , const AViewType & A ) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL twoLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::ArithTraits; + using AlphaCoeffType = typename AViewType::non_const_value_type; if (x.extent(0) == 0) { // no entries to update return; } - else if (alpha == KAT::zero()) { + else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update return; } - using execution_space = typename AViewType::execution_space; constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = typename std::conditional::type; - using TeamPolicyType = Kokkos::TeamPolicy; + using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { // LayoutLeft: one team per column @@ -287,12 +257,12 @@ void twoLevelSyr( const typename AViewType::execution_space & space teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') - , (uplo[0] == 'U') || (uplo[0] == 'u') - , alpha - , x - , A - ); + TwoLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') + , (uplo[0] == 'U') || (uplo[0] == 'u') + , alpha + , x + , A + ); Kokkos::parallel_for("KokkosBlas::syr[twoLevel]", teamPolicy, functor); } @@ -302,12 +272,13 @@ void twoLevelSyr( const 
typename AViewType::execution_space & space // depending on whether execution space is CPU or GPU. // The 'enable_if' makes sure unused kernels are not instantiated. -template < class XViewType +template < class ExecutionSpace + , class XViewType , class AViewType , class IndexType - , typename std::enable_if() >::type* = nullptr + , typename std::enable_if() >::type* = nullptr > -void generalSyrImpl( const typename AViewType::execution_space & space +void generalSyrImpl( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha @@ -318,12 +289,13 @@ void generalSyrImpl( const typename AViewType::execution_space & space singleLevelSyr(space, trans, uplo, alpha, x, A); } -template < class XViewType +template < class ExecutionSpace + , class XViewType , class AViewType , class IndexType - , typename std::enable_if()>::type* = nullptr + , typename std::enable_if()>::type* = nullptr > -void generalSyrImpl( const typename AViewType::execution_space & space +void generalSyrImpl( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 027c31d692..14ee7d1988 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -68,13 +68,14 @@ namespace Impl { // // Implementation of KokkosBlas::syr. 
-template < class XViewType +template < class ExecutionSpace + , class XViewType , class AViewType , bool tpl_spec_avail = syr_tpl_spec_avail::value , bool eti_spec_avail = syr_eti_spec_avail::value > struct SYR { - static void syr( const typename AViewType::execution_space & space + static void syr( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha @@ -85,50 +86,6 @@ struct SYR { { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Syr::syr()\n" ); - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - if ((trans[0] == 'T') || - (trans[0] == 't') || - (trans[0] == 'H') || - (trans[0] == 'h')) { - // Ok - } - else { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::syr(): invalid trans[0] = " << trans[0]; - throw std::runtime_error(oss.str()); - } - - if ((uplo[0] == 'U') || - (uplo[0] == 'u') || - (uplo[0] == 'L') || - (uplo[0] == 'l')) { - // Ok - } - else { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0]; - throw std::runtime_error(oss.str()); - } - - if (A.extent(0) != x.extent(0)) { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::syr(): A.extent(0) = " << A.extent(0) - << ", but x.extent(0) = " << x.extent(0); - throw std::runtime_error(oss.str()); - } - - if (A.extent(1) != x.extent(0)) { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::syr(): A.extent(1) = " << A.extent(1) - << ", but x.extent(0) = " << x.extent(0); - throw std::runtime_error(oss.str()); - } - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::syr[ETI]" : "KokkosBlas::syr[noETI]"); typedef typename AViewType::size_type size_type; @@ -138,22 +95,22 @@ struct SYR { // Prefer int as the index type, but use a larsyr type if needed. if (( numRows < static_cast(INT_MAX) ) && ( numCols < static_cast(INT_MAX) )) { - generalSyrImpl( space - , trans - , uplo - , alpha - , x - , A - ); + generalSyrImpl( space + , trans + , uplo + , alpha + , x + , A + ); } else { - generalSyrImpl( space - , trans - , uplo - , alpha - , x - , A - ); + generalSyrImpl( space + , trans + , uplo + , alpha + , x + , A + ); } Kokkos::Profiling::popRegion(); @@ -173,7 +130,8 @@ struct SYR { // We may spread out definitions (see _DEF macro below) across one or more .cpp files. // #define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR< Kokkos::View< const SCALAR* \ + extern template struct SYR< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -188,7 +146,8 @@ struct SYR { >; #define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR< Kokkos::View< const SCALAR* \ + template struct SYR< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 68d30deccd..cf40448675 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -23,8 +23,9 @@ namespace KokkosBlas { /// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. /// -/// \tparam XViewType Input vector, as a 1-D Kokkos::View -/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View /// /// \param space [in] Execution space instance on which to run the kernel. 
/// This may contain information about which stream to @@ -34,8 +35,8 @@ namespace KokkosBlas { /// \param alpha [in] Input coefficient of x * x^{T,H} /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template -void syr( const typename AViewType::execution_space & space +template +void syr( const ExecutionSpace & space , const char trans[] , const char uplo[] , const typename AViewType::const_value_type & alpha @@ -44,6 +45,15 @@ void syr( const typename AViewType::execution_space & space ) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::syr(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be compatible with ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be compatible with ExecutionSpace"); + static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." ); @@ -60,6 +70,33 @@ void syr( const typename AViewType::execution_space & space KokkosKernels::Impl::throw_runtime_exception(os.str()); } + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((uplo[0] == 'U') || + (uplo[0] == 'u') || + (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } + else { + std::ostringstream oss; + oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] + << "'. 
It must be equalt to 'U' or 'u' or 'L' or 'l'"; + throw std::runtime_error(oss.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + using ALayout = typename AViewType::array_layout; // Minimize the number of Impl::SYR instantiations, by standardizing @@ -76,29 +113,13 @@ void syr( const typename AViewType::execution_space & space , Kokkos::MemoryTraits > AVT; - if (( A.extent(0) == 0 ) || - ( A.extent(1) == 0 )) { - // For degenerate cases, use fallback implementation to avoid potential - // (unlikely?) circular dependence issues by including other KokkosBlas - // headers. - const bool eti_spec_avail = KokkosBlas::Impl::syr_eti_spec_avail::value; - Impl::SYR::syr( space - , trans - , uplo - , alpha - , x - , A - ); - } - else { - Impl::SYR::syr( space - , trans - , uplo - , alpha - , x - , A - ); - } + Impl::SYR::syr( space + , trans + , uplo + , alpha + , x + , A + ); } /// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. @@ -119,13 +140,13 @@ void syr( const char trans[] , const AViewType & A ) { const typename AViewType::execution_space space = typename AViewType::execution_space(); - syr( space - , trans - , uplo - , alpha - , x - , A - ); + syr( space + , trans + , uplo + , alpha + , x + , A + ); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index c0317cdbc8..68f02c1aef 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -28,15 +28,15 @@ struct syr_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTA, MEMSPACE) \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< 
SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -44,30 +44,30 @@ struct syr_tpl_spec_avail { enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTA, MEMSPACE) \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct 
syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -75,15 +75,25 @@ KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + 
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 7438af7035..7eda8820e4 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -28,23 +28,7 @@ namespace Impl { const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'trans' for HostBlas::syr()"); \ - } \ - if (( uplo[0] == 'U' ) || \ - ( uplo[0] == 'u' ) || \ - ( uplo[0] == 'L' ) || \ - ( uplo[0] == 'l' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'uplo' for HostBlas::syr()"); \ - } + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); #define KOKKOSBLAS2_DSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ @@ -330,25 +314,25 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, 
true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 711b757151..37a35f7ef7 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace Test { @@ -264,8 +265,8 @@ void SyrTester< ScalarX typename _ViewTypeX::const_type c_x = x; - _HostViewTypeX h_x = Kokkos::create_mirror_view(x); - _HostViewTypeA h_A = Kokkos::create_mirror_view(A); + _HostViewTypeX h_x = Kokkos::create_mirror(x); + _HostViewTypeA h_A = Kokkos::create_mirror(A); _ViewTypeExpected h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; @@ -714,7 +715,7 @@ T SyrTester< ScalarX { T output(input); #if 0 - T twoPi( 2. * piVal ); + T twoPi( 2. * Kokkos::numbers::pi ); if (input > 0.) 
{ output -= std::floor( input / twoPi ) * twoPi; } From fdb0934aed8d8d75a5df9e691b0680ff75395033 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 10:28:08 -0600 Subject: [PATCH 004/231] Backup --- blas/impl/KokkosBlas2_syr_spec.hpp | 45 ++-- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 203 ++++++------------ .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 92 ++++---- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 18 +- 4 files changed, 145 insertions(+), 213 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 14ee7d1988..2b4887a9a3 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -20,9 +20,9 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" -#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +//#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // AquiEEP #include -#endif +//#endif namespace KokkosBlas { namespace Impl { @@ -67,23 +67,18 @@ namespace Impl { // syr // -// Implementation of KokkosBlas::syr. template < class ExecutionSpace , class XViewType , class AViewType - , bool tpl_spec_avail = syr_tpl_spec_avail::value - , bool eti_spec_avail = syr_eti_spec_avail::value > -struct SYR { - static void syr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) -#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - { +static void kk_syr( const ExecutionSpace & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) +{ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Syr::syr()\n" ); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::syr[ETI]" : "KokkosBlas::syr[noETI]"); @@ -114,6 +109,26 @@ struct SYR { } Kokkos::Profiling::popRegion(); +} + +// Implementation of KokkosBlas::syr. +template < class ExecutionSpace + , class XViewType + , class AViewType + , bool tpl_spec_avail = syr_tpl_spec_avail::value + , bool eti_spec_avail = syr_eti_spec_avail::value + > +struct SYR { + static void syr( const ExecutionSpace & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + kk_syr(space, trans, uplo, alpha, x, A); } #else ; diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 7eda8820e4..a7c7b1e6b7 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -22,23 +22,22 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ +#define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT) \ + bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const double* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -47,12 +46,12 @@ namespace Impl { > { \ typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -66,40 +65,28 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ - if (A_is_ll) { \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_SSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const float* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -108,12 +95,12 @@ namespace Impl { > { \ typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , 
Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -127,40 +114,28 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ - if (A_is_ll) { \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -169,12 +144,12 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -188,60 +163,37 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ const std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::syru( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , 
LDA \ - ); \ - } \ - else { \ - HostBlas>::syrc( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasZsyru() is not supported."); \ } \ else { \ - if (justTranspose) { \ - HostBlas>::syru( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: blasZsyrc() requires LayoutLeft views."); \ - } \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -250,12 +202,12 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -269,46 +221,23 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ const std::complex alpha_val = 
static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::syru( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - HostBlas>::syrc( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasCsyru() is not supported"); \ } \ else { \ - if (justTranspose) { \ - HostBlas>::syru( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: blasCsyrc() requires LayoutLeft views."); \ - } \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ } \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index f5bdc9d2c7..b60e5c95ef 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,39 +22,23 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? 
A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'trans' for cudaBlas::syr()"); \ - } \ - if (( uplo[0] == 'U' ) || \ - ( uplo[0] == 'u' ) || \ - ( uplo[0] == 'L' ) || \ - ( uplo[0] == 'l' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'uplo' for cudaBlas::syr()"); \ - } + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const double* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -63,12 +47,12 @@ namespace Impl { > { \ typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -82,7 +66,7 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (A_is_ll) { \ @@ -114,15 +98,15 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const float* \ 
- , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -131,12 +115,12 @@ namespace Impl { > { \ typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -150,7 +134,7 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (A_is_ll) { \ @@ -182,15 +166,15 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -199,12 +183,12 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -218,7 +202,7 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + 
KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ @@ -271,15 +255,15 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUTX, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -288,12 +272,12 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -307,7 +291,7 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ @@ -365,21 +349,41 @@ KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::Layout KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) 
+KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index fe29211161..4561cc2737 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -28,23 +28,7 @@ namespace Impl { const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error( "Error: invalid 'trans' for rocBlas::syr()"); \ - } \ - if (( uplo[0] == 'U' ) || \ - ( uplo[0] == 'u' ) || \ - ( uplo[0] == 'L' ) || \ - ( uplo[0] == 'l' )) { \ - } \ - else { \ - throw std::runtime_error( "Error: invalid 'uplo' for rocBlas::syr()"); \ - } + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); #define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ From 660f6781c5c748ebaaa66b49d9bddb4b32d0e4cd Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 11:38:21 -0600 Subject: [PATCH 005/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 241 ++++++------------ .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 179 ++++--------- 2 files changed, 127 insertions(+), 293 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index b60e5c95ef..92bf0983e1 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -23,7 +23,6 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT) \ - bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? 
A.extent(0) : A.extent(1)); \ @@ -69,30 +68,16 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ @@ -137,30 +122,16 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ @@ -206,49 +177,22 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, 
space.cuda_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: cublasZsyru() is not supported."); \ } \ else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: cublasZsyrc() requires LayoutLeft views."); \ - } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ @@ -295,94 +239,67 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) 
\ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: cublasCsyru() is not supported."); \ } \ else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: cublasCsyrc() requires LayoutLeft views."); \ - } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) 
-KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) 
+KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 4561cc2737..fef9a0dcbe 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -23,7 +23,6 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ - bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? 
A.extent(0) : A.extent(1)); \ @@ -69,30 +68,16 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ @@ -142,30 +127,16 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ @@ -216,49 +187,22 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: rocblasZsyru() is not supported."); \ } \ else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: rocblasZsyrc() requires LayoutLeft views."); \ - } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ @@ -310,49 +254,22 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , 
reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: rocblasCsyru() is not supported."); \ } \ else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyru( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyrc() requires LayoutLeft: throwing exception\n"); \ - throw std::runtime_error("Error: rocblasCgec() requires LayoutLeft views."); \ - } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ From d79e2f952e1176ed6fa3a3614917bddf71af580b Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 15:24:53 -0600 Subject: [PATCH 006/231] Correct usage of 'assignable' --- blas/src/KokkosBlas2_syr.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index cf40448675..77f6a17468 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -45,14 +45,18 @@ void syr( const ExecutionSpace & space ) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::syr(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert( + 
Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from XViewType"); static_assert( Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be compatible with ExecutionSpace"); + "AViewType memory space must be accessible from the ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be compatible with ExecutionSpace"); + "XViewType memory space must be accessible from the ExecutionSpace"); static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." ); From e7fcac91b570125167323c90e00743ee53e2dec1 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 12 Apr 2023 05:34:49 -0600 Subject: [PATCH 007/231] Backup --- blas/src/KokkosBlas2_syr.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 77f6a17468..57bcdb71a8 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -52,11 +52,11 @@ void syr( const ExecutionSpace & space static_assert( Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from the ExecutionSpace"); + "AViewType memory space must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from the ExecutionSpace"); + "XViewType memory space must be accessible from ExecutionSpace"); static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." 
); From bb6bb00e2efb9d04a12d73108358fb2860670840 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 24 Apr 2023 23:43:56 -0600 Subject: [PATCH 008/231] Backup --- blas/impl/KokkosBlas2_syr_spec.hpp | 9 +- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 171 ++++---- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 408 ++++++++++-------- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 336 ++++++++------- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 313 +++++++------- 5 files changed, 636 insertions(+), 601 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 2b4887a9a3..91906dcd5a 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct syr_eti_spec_avail { enum : bool { value = false }; }; @@ -42,7 +42,8 @@ struct syr_eti_spec_avail { // #define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ - struct syr_eti_spec_avail< Kokkos::View< const SCALAR* \ + struct syr_eti_spec_avail< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -115,8 +116,8 @@ static void kk_syr( const ExecutionSpace & space template < class ExecutionSpace , class XViewType , class AViewType - , bool tpl_spec_avail = syr_tpl_spec_avail::value - , bool eti_spec_avail = syr_eti_spec_avail::value + , bool tpl_spec_avail = syr_tpl_spec_avail::value + , bool eti_spec_avail = syr_eti_spec_avail::value > struct SYR { static void syr( const ExecutionSpace & space diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index 68f02c1aef..2be20a44af 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // 
Specialization struct which defines whether a specialization exists -template +template struct syr_tpl_spec_avail { enum : bool { value = false }; }; @@ -28,107 +28,120 @@ struct syr_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::HostSpace) 
-KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +#endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + 
template <> \ + struct syr_tpl_spec_avail< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::CudaSpace) 
-KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template <> \ - struct syr_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device< Kokkos::Experimental::HIP, \ - , Kokkos::Experimental::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device< Kokkos::Experimental::HIP \ - , Kokkos::Experimental::HIPSpace \ - > \ - , 
Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp 
b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index a7c7b1e6b7..925890b673 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -29,186 +29,131 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< 
SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /*space*/ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const float* \ + , LAYOUT \ + , 
Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /*space*/ \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + HostBlas::syr( uplo \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasZsyru() is not supported."); \ - } \ - else { \ - HostBlas>::syrc( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const Kokkos::complex* \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ - , Kokkos::View< Kokkos::complex** \ + , Kokkos::View< Kokkos::complex** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ , ETI_SPEC_AVAIL \ > { \ - typedef Kokkos::complex SCALAR; \ + typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -219,49 +164,132 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ 
KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ + const std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasCsyru() is not supported"); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasZsyru() is not supported."); \ } \ else { \ - HostBlas>::syrc( uplo \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & /* space */ \ + , const char trans[] \ + , const char uplo[] \ + , 
typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasCsyru() is not supported"); \ + } \ + else { \ + HostBlas>::syrc( uplo \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, 
Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) 
+KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +#endif } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 92bf0983e1..a7a1e0c6c2 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -29,16 +29,17 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const double* \ +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const double* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -47,12 +48,12 @@ namespace Impl { typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -83,16 +84,17 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const float* \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, 
ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const float* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -101,12 +103,12 @@ namespace Impl { typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -137,169 +139,171 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, 
trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: cublasZsyru() is not supported."); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* 
AquiEPP */ \ + throw std::runtime_error("Error: cublasZsyru() is not supported."); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: cublasCsyru() is not supported."); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( 
cublasCsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: cublasCsyru() is not supported."); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ 
+ , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, 
Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) 
+KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index fef9a0dcbe..38870efda7 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -29,16 +29,17 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ - struct SYR< Kokkos::View< const double* \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const double* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -47,12 +48,12 @@ namespace Impl { typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -83,21 +84,17 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ - struct SYR< Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ + struct SYR< EXEC_SPACE \ , Kokkos::View< const float* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -106,12 +103,12 @@ namespace Impl { typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -142,159 +139,151 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< 
const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: rocblasZsyru() is not supported."); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , 
Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: rocblasZsyru() is not supported."); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , 
Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: rocblasCsyru() is not supported."); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ 
+ > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: rocblasCsyru() is not supported."); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) 
+KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) 
+KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas From fdfe6128a8e0d5da467a503daea7f1707bae331a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 8 May 2023 15:07:58 -0600 Subject: [PATCH 009/231] Backup --- blas/tpls/KokkosBlas_Host_tpl.cpp | 58 ++++++++++++------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 44fc134434..329aa354e0 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -943,20 +943,14 @@ void HostBlas >::syru( const char uplo , std::complex* a , int lda ) { -#if 0 // AquiEEP - std::string trans("T"); - F77_FUNC_CGERU( &trans[0] - , &n + F77_FUNC_CSYRU( &uplo // AquiEEP , &n , &alpha , (const std::complex*)x , &incx - , (const std::complex*)x - , &incx , (std::complex*)a , &lda ); -#endif } template <> void HostBlas >::syrc( const char uplo @@ -967,14 +961,14 @@ void HostBlas >::syrc( const char uplo , std::complex* a , int lda ) { - F77_FUNC_CHER( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); + F77_FUNC_CSYRC( &uplo // AquiEEP + , &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); } template <> void HostBlas >::trsv(const char uplo, const char transa, @@ -1147,7 +1141,7 @@ void HostBlas >::gerc( (std::complex*)a, &lda); } template <> -void HostBlas >::syru( const char uplo +void HostBlas >::syru( const char /*uplo*/ , int n , const std::complex alpha , const std::complex* x @@ -1155,23 +1149,17 @@ void HostBlas >::syru( const char uplo , std::complex* a , int lda ) { -#if 0 // AquiEEP - 
std::string trans("T"); - F77_FUNC_ZGERU( &trans[0] - , &n - , &n + F77_FUNC_ZSYRU( /*&uplo,*/ // AquiEEP + &n , &alpha - , (const std::complex*)x - , &incx - , (const std::complex*)x + , (const std::complex*)x , &incx - , (std::complex*)a + , (std::complex*)a , &lda ); -#endif } template <> -void HostBlas >::syrc( const char uplo +void HostBlas >::syrc( const char /*uplo*/ , int n , const std::complex alpha , const std::complex* x @@ -1179,14 +1167,14 @@ void HostBlas >::syrc( const char uplo , std::complex* a , int lda ) { - F77_FUNC_ZHER( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); + F77_FUNC_ZSYRC( /*&uplo,*/ // AquiEEP + &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); } template <> void HostBlas >::trsv(const char uplo, const char transa, From 5bca24c0b91b6ce68e2c1ed1b90627666f3bb0f0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 8 May 2023 19:55:23 -0600 Subject: [PATCH 010/231] Backup --- blas/src/KokkosBlas1_swap.hpp | 2 + .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 16 +- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 167 +++++++++--------- blas/unit_test/Test_Blas2_syr.hpp | 71 ++++---- 4 files changed, 127 insertions(+), 129 deletions(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index f91d090cd5..fa0441a0fe 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -93,8 +93,10 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { +#if 0 // AquiEEP Impl::Swap::swap(space, X, Y); +#endif } Kokkos::Profiling::popRegion(); } diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 925890b673..a65c83b179 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ 
b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -58,7 +58,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char trans[] \ + , const char /*trans[]*/ \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -67,7 +67,7 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo \ + HostBlas::syr( uplo[0] \ , N \ , alpha \ , X.data() \ @@ -108,7 +108,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char trans[] \ + , const char /*trans[]*/ \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -117,7 +117,7 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo \ + HostBlas::syr( uplo[0] \ , N \ , alpha \ , X.data() \ @@ -157,7 +157,7 @@ namespace Impl { , Kokkos::MemoryTraits \ > AViewType; \ \ - static void syr( const typename AViewType::execution_space & /* space */ \ + static void syr( const typename AViewType::execution_space & space \ , const char trans[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ @@ -175,7 +175,7 @@ namespace Impl { throw std::runtime_error("Error: blasZsyru() is not supported."); \ } \ else { \ - HostBlas>::syrc( uplo \ + HostBlas>::syrc( uplo[0] \ , N \ , alpha_val \ , reinterpret_cast*>(X.data()) \ @@ -216,7 +216,7 @@ namespace Impl { , Kokkos::MemoryTraits \ > AViewType; \ \ - static void syr( const typename AViewType::execution_space & /* space */ \ + static void syr( const typename AViewType::execution_space & space \ , const char trans[] \ , const 
char uplo[] \ , typename AViewType::const_value_type & alpha \ @@ -234,7 +234,7 @@ namespace Impl { throw std::runtime_error("Error: blasCsyru() is not supported"); \ } \ else { \ - HostBlas>::syrc( uplo \ + HostBlas>::syrc( uplo[0] \ , N \ , alpha_val \ , reinterpret_cast*>(X.data()) \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index a7a1e0c6c2..d363a70f68 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,12 +22,13 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT) \ +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; #define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ @@ -48,7 +49,7 @@ namespace Impl { typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ @@ -58,7 +59,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ + , const char /*trans[]*/ \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -66,11 +67,11 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ - , uplo \ + , fillMode \ , N \ , &alpha \ , X.data() \ @@ -103,7 +104,7 @@ namespace Impl { typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ @@ -113,7 +114,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ + , const char /*trans[]*/ \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -121,11 +122,11 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + 
KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ - , uplo \ + , fillMode \ , N \ , &alpha \ , X.data() \ @@ -139,73 +140,73 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: cublasZsyru() is not supported."); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( 
cublasZsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: cublasZsyru() is not supported."); \ + } \ + else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle /*AquiEEP*/ \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , 
reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ + , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -221,7 +222,7 @@ namespace Impl { typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< SCALAR** \ @@ -239,7 +240,7 @@ namespace Impl { ) { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ @@ -249,15 +250,15 @@ namespace Impl { throw std::runtime_error("Error: cublasCsyru() is not supported."); \ } \ else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle /*AquiEEP*/ \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ ); \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 37a35f7ef7..d3e9dbf53d 100644 --- 
a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -43,9 +43,9 @@ class SyrTester typedef Kokkos::View _ViewTypeX; typedef Kokkos::View _ViewTypeA; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View _ViewTypeExpected; + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View _ViewTypeExpected; typedef Kokkos::ArithTraits _KAT_A; typedef typename _KAT_A::mag_type _AuxType; @@ -260,15 +260,10 @@ void SyrTester< ScalarX test_cx = true; } - _ViewTypeX x("X", _M); - _ViewTypeA A("A", _M, _N); + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - typename _ViewTypeX::const_type c_x = x; - - _HostViewTypeX h_x = Kokkos::create_mirror(x); - _HostViewTypeA h_A = Kokkos::create_mirror(A); - - _ViewTypeExpected h_expected("expected A += alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -277,23 +272,23 @@ void SyrTester< ScalarX // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** this->populateVariables( alpha - , h_x - , h_A - , h_expected - , x - , A + , x.h_view + , A.h_view + , h_expected.d_view + , x.d_view + , A.d_view , expectedResultIsKnown ); // ******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - _ViewTypeExpected h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", 
typeid(alpha).name() ); this->populateVanillaValues( alpha - , h_x - , h_A - , h_vanilla + , x.h_view + , A.h_view + , h_vanilla.d_view ); // ******************************************************************** @@ -304,29 +299,29 @@ void SyrTester< ScalarX // Compare h_vanilla against h_expected // ****************************************************************** this->compareVanillaExpected( alpha - , h_vanilla - , h_expected + , h_vanilla.d_view + , h_expected.d_view ); } else { // ****************************************************************** // Copy h_vanilla to h_expected // ****************************************************************** - Kokkos::deep_copy(h_expected, h_vanilla); + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); } // ******************************************************************** // Step 5 of 7: test with 'non const x' // ******************************************************************** - _ViewTypeA org_A("Org_A", _M, _N); - Kokkos::deep_copy(org_A, A); + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); // AquiEEP (see ger as well) + Kokkos::deep_copy(org_A.d_base, A.d_base); if (test_x) { this->callKkSyrAndCompareAgainstExpected( alpha - , x - , A - , h_A - , h_expected + , x.d_view + , A.d_view + , A.h_view + , h_expected.d_view , "non const x" ); } @@ -335,13 +330,13 @@ void SyrTester< ScalarX // Step 6 of 7: test with const x // ******************************************************************** if (test_cx) { - Kokkos::deep_copy(A, org_A); + Kokkos::deep_copy(A.d_base, org_A.d_base); this->callKkSyrAndCompareAgainstExpected( alpha - , c_x - , A - , h_A - , h_expected + , x.d_view_const + , A.d_view + , A.h_view + , h_expected.d_view , "const x" ); } @@ -349,10 +344,10 @@ void SyrTester< ScalarX // ******************************************************************** // Step 7 of 7: tests with invalid values on the first input parameter // 
******************************************************************** - EXPECT_ANY_THROW( KokkosBlas::syr(".", "U", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW( KokkosBlas::syr("", "U", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for mode ''"; - EXPECT_ANY_THROW( KokkosBlas::syr("T", ".", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for uplo '.'"; - EXPECT_ANY_THROW( KokkosBlas::syr("T", "", alpha, x, A) ) << "Failed test: kk syr should have thrown an exception for uplo ''"; + EXPECT_ANY_THROW( KokkosBlas::syr(".", "U", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( KokkosBlas::syr( "", "U", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for mode ''"; + EXPECT_ANY_THROW( KokkosBlas::syr("T", ".", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW( KokkosBlas::syr("T", "", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for uplo ''"; std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; } From b1983978b8c5021f88abbf15d4231c417bf69a2e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 8 May 2023 22:39:54 -0600 Subject: [PATCH 011/231] Backup --- blas/src/KokkosBlas1_swap.hpp | 2 +- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 31 ++++++------------- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 4 +-- blas/tpls/KokkosBlas_Host_tpl.cpp | 20 +++++++++--- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index fa0441a0fe..ea864cc3b4 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,7 +41,7 @@ namespace KokkosBlas { 
/// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& space, XVector const& x, YVector const& y) { +void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { // AquiEEP // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index a65c83b179..691f78f89a 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -26,7 +26,6 @@ namespace Impl { bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); #define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -58,7 +57,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char /*trans[]*/ \ + , const char /*trans*/[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -67,6 +66,7 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + constexpr int one = 1; \ HostBlas::syr( uplo[0] \ , N \ , alpha \ @@ -108,7 +108,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char /*trans[]*/ \ + , const char /*trans*/[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -117,6 +117,7 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + constexpr int one = 1; \ HostBlas::syr( uplo[0] \ , N \ , alpha \ @@ -167,7 +168,6 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ kk_syr( space, trans, uplo, alpha, X, A); \ @@ -175,14 +175,9 @@ namespace Impl { throw std::runtime_error("Error: blasZsyru() is not supported."); \ } \ else { \ - HostBlas>::syrc( uplo[0] \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasZsyrc() is not supported."); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -226,7 +221,6 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ kk_syr( space, trans, uplo, alpha, X, A); \ @@ -234,14 +228,9 @@ namespace Impl { throw std::runtime_error("Error: blasCsyru() is not supported"); \ } \ else { \ - HostBlas>::syrc( uplo[0] \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasCsyrc() is not supported"); \ } \ Kokkos::Profiling::popRegion(); \ 
} \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index d363a70f68..8b28e7ad44 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -59,7 +59,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & space \ - , const char /*trans[]*/ \ + , const char /*trans*/[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -114,7 +114,7 @@ namespace Impl { > AViewType; \ \ static void syr( const typename AViewType::execution_space & space \ - , const char /*trans[]*/ \ + , const char /*trans*/[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 329aa354e0..f9318e128a 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -268,6 +268,7 @@ void F77_BLAS_MANGLE(dsyr, DSYR)( const char* , double* , int* ); +#if 0 // AquiEEP void F77_BLAS_MANGLE(csyru, CSYRU)( const char* , int* , const std::complex* @@ -298,6 +299,7 @@ void F77_BLAS_MANGLE(zsyrc, ZSYRC)( int* , std::complex* , int* ); +#endif /// /// Trsv @@ -522,10 +524,12 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) #define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) +#if 0 // AquiEEP #define F77_FUNC_CSYRU F77_BLAS_MANGLE(csyru, CSYRU) #define F77_FUNC_CSYRC F77_BLAS_MANGLE(csyrc, CSYRC) #define F77_FUNC_ZSYRU F77_BLAS_MANGLE(zsyru, ZSYRU) #define F77_FUNC_ZSYRC F77_BLAS_MANGLE(zsyrc, ZSYRC) +#endif #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) @@ -934,6 +938,7 @@ void HostBlas >::gerc( (const std::complex*)y, &incy, (std::complex*)a, &lda); } +#if 0 // AquiEEP template <> void HostBlas >::syru( const char uplo , int n @@ -943,7 +948,7 
@@ void HostBlas >::syru( const char uplo , std::complex* a , int lda ) { - F77_FUNC_CSYRU( &uplo // AquiEEP + F77_FUNC_CSYRU( &uplo , &n , &alpha , (const std::complex*)x @@ -952,6 +957,8 @@ void HostBlas >::syru( const char uplo , &lda ); } +#endif +#if 0 // AquiEEP template <> void HostBlas >::syrc( const char uplo , int n @@ -961,7 +968,7 @@ void HostBlas >::syrc( const char uplo , std::complex* a , int lda ) { - F77_FUNC_CSYRC( &uplo // AquiEEP + F77_FUNC_CSYRC( &uplo , &n , &alpha , (const std::complex*)x @@ -970,6 +977,7 @@ void HostBlas >::syrc( const char uplo , &lda ); } +#endif template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, @@ -1140,6 +1148,7 @@ void HostBlas >::gerc( (const std::complex*)y, &incy, (std::complex*)a, &lda); } +#if 0 // AquiEEP template <> void HostBlas >::syru( const char /*uplo*/ , int n @@ -1149,7 +1158,7 @@ void HostBlas >::syru( const char /*uplo*/ , std::complex* a , int lda ) { - F77_FUNC_ZSYRU( /*&uplo,*/ // AquiEEP + F77_FUNC_ZSYRU( /*&uplo,*/ &n , &alpha , (const std::complex*)x @@ -1158,6 +1167,8 @@ void HostBlas >::syru( const char /*uplo*/ , &lda ); } +#endif +#if 0 // AquiEEP template <> void HostBlas >::syrc( const char /*uplo*/ , int n @@ -1167,7 +1178,7 @@ void HostBlas >::syrc( const char /*uplo*/ , std::complex* a , int lda ) { - F77_FUNC_ZSYRC( /*&uplo,*/ // AquiEEP + F77_FUNC_ZSYRC( /*&uplo,*/ &n , &alpha , (const std::complex*)x @@ -1176,6 +1187,7 @@ void HostBlas >::syrc( const char /*uplo*/ , &lda ); } +#endif template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, From d19267e22160968821f9971cd189a8565ae37402 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 9 May 2023 00:33:47 -0600 Subject: [PATCH 012/231] Backup --- batched/KokkosBatched_Util.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 27fb0bf338..14ac71f1cb 100644 --- 
a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -638,7 +638,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, #else template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::ALL_t i2, Kokkos::ALL_t i3, + Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); @@ -681,7 +681,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( #else template KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, + ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); From 4770990b39bff083b527f8eb5d02124a168a6ed8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 9 May 2023 09:09:18 -0600 Subject: [PATCH 013/231] Backup --- batched/KokkosBatched_Util.hpp | 4 +-- blas/unit_test/Test_Blas2_syr.hpp | 59 ++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 14ac71f1cb..27fb0bf338 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -638,7 +638,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, #else template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, + Kokkos::ALL_t i2, Kokkos::ALL_t i3, const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); @@ -681,7 +681,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( #else template KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, + ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, 
const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index d3e9dbf53d..a3030b1189 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -1390,6 +1390,16 @@ int test_syr( const std::string & caseName ) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s ...\n", caseName.c_str() ); + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && aBool; + #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); @@ -1402,15 +1412,28 @@ int test_syr( const std::string & caseName ) { tester.test(2, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0 , true, false, false); //tester.test(1024, 0 , true, false, true); //tester.test(1024, 0 , true, true, false); //tester.test(1024, 0 , true, true, true); + } + else { + //tester.test(1024, 0 , false, false, true); + //tester.test(1024, 0 , false, true, false); + //tester.test(1024, 0 , false, true, true); + } tester.test(50, 4 ); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0 , true, false, true); //tester.test(2131, 0 , true, true, true); + } + else { + //tester.test(2131, 0 , false, false, true); + //tester.test(2131, 0 , false, true, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); @@ -1429,15 +1452,28 @@ int 
test_syr( const std::string & caseName ) { tester.test(2, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, false); //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, false); //tester.test(1024, 0, true, true, true); + } + else { + //tester.test(1024, 0, false, false, true); + //tester.test(1024, 0, false, true, false); + //tester.test(1024, 0, false, true, true); + } tester.test(50, 4); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0, true, false, true); //tester.test(2131, 0, true, true, true); + } + else { + //tester.test(2131, 0, false, false, true); + //tester.test(2131, 0, false, true, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); @@ -1454,15 +1490,28 @@ int test_syr( const std::string & caseName ) { tester.test(0, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, false); //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, false); //tester.test(1024, 0, true, true, true); + } + else { + //tester.test(1024, 0, false, false, true); + //tester.test(1024, 0, false, true, false); + //tester.test(1024, 0, false, true, true); + } tester.test(50, 4); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0, true, false, true); //tester.test(2131, 0, true, true, true); + } + else { + //tester.test(2131, 0, false, false, true); + //tester.test(2131, 0, false, true, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); @@ -1476,8 +1525,14 @@ int test_syr( const std::string & caseName ) { if (true) { Test::SyrTester tester; tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, true); + } + else { + //tester.test(1024, 0, false, false, true); + 
//tester.test(1024, 0, false, true, true); + } } if (true) { @@ -1504,8 +1559,6 @@ TEST_F(TestCategory, syr_float) { } #endif -#if 1 - #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_float) { @@ -1550,5 +1603,3 @@ TEST_F(TestCategory, syr_double_int) { Kokkos::Profiling::popRegion(); } #endif - -#endif // if 1 From 4a433cfaf237a45bdcb6d72ac9383f225476c06f Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 15 May 2023 16:39:42 -0600 Subject: [PATCH 014/231] Backup --- blas/src/KokkosBlas2_syr.hpp | 1 + .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 204 +++++++++--------- blas/tpls/KokkosBlas_Host_tpl.cpp | 155 +++++-------- 3 files changed, 153 insertions(+), 207 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 57bcdb71a8..9878ede263 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -49,6 +49,7 @@ void syr( const ExecutionSpace & space Kokkos::SpaceAccessibility::assignable, "AViewType memory space must be assignable from XViewType"); + static_assert( Kokkos::SpaceAccessibility::accessible, diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 691f78f89a..1e906575fa 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -130,110 +130,110 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ 
- , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasZsyru() is not supported."); \ - } \ - else { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasZsyrc() is not supported."); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & 
A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasZsyru() is not supported."); \ + } \ + else { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasZsyrc() is not supported."); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasCsyru() is not supported"); \ - } \ - else { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasCsyrc() is not supported"); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasCsyru() is not supported"); \ + } \ + else { \ + kk_syr( space, trans, uplo, alpha, X, A); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() is not supported\n"); /* AquiEPP */ \ + throw std::runtime_error("Error: blasCsyrc() is not supported"); \ + } \ + 
Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index f9318e128a..1f170a89be 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -236,14 +236,14 @@ void F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, std::complex*, int*); -void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, std::complex*, int*); +void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, @@ -268,38 +268,30 @@ void F77_BLAS_MANGLE(dsyr, DSYR)( const char* , double* , int* ); -#if 0 // AquiEEP -void F77_BLAS_MANGLE(csyru, CSYRU)( const char* - , int* - , const std::complex* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(csyrc, CSYRC)( const char* - , int* - , const std::complex* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(zsyru, ZSYRU)( int* - , const std::complex* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(zsyrc, ZSYRC)( int* - , const std::complex* - , const std::complex* - , int* - , std::complex* - , int* - ); -#endif +// Although there is a cgeru, there is no csyru +// Although there is a zgeru, there is no zsyru +// Although there is a cgerc, there is no csyrc, but there is cher (see below) +// Although there is a zgerc, there is no zsyrc, but there is zher (see below) + +/// +/// Her +/// + +void F77_BLAS_MANGLE(cher, CHER)( const 
char* + , int* + , const std::complex* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(zher, ZHER)( int* + , const std::complex* + , const std::complex* + , int* + , std::complex* + , int* + ); /// /// Trsv @@ -518,18 +510,15 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) #define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) #define F77_FUNC_CGERU F77_BLAS_MANGLE(cgeru, CGERU) -#define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) +#define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) #define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) #define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) -#if 0 // AquiEEP -#define F77_FUNC_CSYRU F77_BLAS_MANGLE(csyru, CSYRU) -#define F77_FUNC_CSYRC F77_BLAS_MANGLE(csyrc, CSYRC) -#define F77_FUNC_ZSYRU F77_BLAS_MANGLE(zsyru, ZSYRU) -#define F77_FUNC_ZSYRC F77_BLAS_MANGLE(zsyrc, ZSYRC) -#endif + +#define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) +#define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) @@ -938,29 +927,8 @@ void HostBlas >::gerc( (const std::complex*)y, &incy, (std::complex*)a, &lda); } -#if 0 // AquiEEP -template <> -void HostBlas >::syru( const char uplo - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , std::complex* a - , int lda - ) { - F77_FUNC_CSYRU( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); -} -#endif -#if 0 // AquiEEP template <> -void HostBlas >::syrc( const char uplo +void HostBlas >::cher( const char uplo , int n , const std::complex alpha , const std::complex* x @@ -968,16 +936,15 @@ void HostBlas >::syrc( const char uplo , std::complex* a , int lda ) { - F77_FUNC_CSYRC( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda 
- ); + F77_FUNC_CHER( &uplo + , &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); } -#endif template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, @@ -1148,29 +1115,8 @@ void HostBlas >::gerc( (const std::complex*)y, &incy, (std::complex*)a, &lda); } -#if 0 // AquiEEP -template <> -void HostBlas >::syru( const char /*uplo*/ - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , std::complex* a - , int lda - ) { - F77_FUNC_ZSYRU( /*&uplo,*/ - &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); -} -#endif -#if 0 // AquiEEP template <> -void HostBlas >::syrc( const char /*uplo*/ +void HostBlas >::zher( const char uplo , int n , const std::complex alpha , const std::complex* x @@ -1178,16 +1124,15 @@ void HostBlas >::syrc( const char /*uplo*/ , std::complex* a , int lda ) { - F77_FUNC_ZSYRC( /*&uplo,*/ - &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); + F77_FUNC_ZHER( &uplo, + , &n + , &alpha + , (const std::complex*)x + , &incx + , (std::complex*)a + , &lda + ); } -#endif template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, From e24b1ed3e39deda0ff2e9e2cc8a99b37a50a4269 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 15 May 2023 18:49:57 -0600 Subject: [PATCH 015/231] Backup --- blas/tpls/KokkosBlas_Host_tpl.cpp | 5 +++-- blas/tpls/KokkosBlas_Host_tpl.hpp | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 1f170a89be..6dbe6e84bb 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -285,7 +285,8 @@ void F77_BLAS_MANGLE(cher, CHER)( const char* , std::complex* , int* ); -void F77_BLAS_MANGLE(zher, ZHER)( int* +void F77_BLAS_MANGLE(zher, ZHER)( const char* + , int* , const std::complex* , const std::complex* , int* @@ 
-1124,7 +1125,7 @@ void HostBlas >::zher( const char uplo , std::complex* a , int lda ) { - F77_FUNC_ZHER( &uplo, + F77_FUNC_ZHER( &uplo , &n , &alpha , (const std::complex*)x diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 5772ada279..d06406d764 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -82,7 +82,7 @@ struct HostBlas { , int lda ); - static void syru( const char uplo + static void cher( const char uplo , int n , const T alpha , const T* x @@ -91,7 +91,7 @@ struct HostBlas { , int lda ); - static void syrc( const char uplo + static void zher( const char uplo , int n , const T alpha , const T* x From c1463cccc45f68ab4419a3e2ca0bf5881e9173b6 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 15 May 2023 21:33:18 -0600 Subject: [PATCH 016/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 217 +++++++++--------- 1 file changed, 113 insertions(+), 104 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 1e906575fa..dd355c9c95 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -26,6 +26,7 @@ namespace Impl { bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); #define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -66,7 +67,6 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - constexpr int one = 1; \ HostBlas::syr( uplo[0] \ , N \ , alpha \ @@ -117,7 +117,6 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - constexpr int one = 1; \ HostBlas::syr( uplo[0] \ , N \ , alpha \ @@ -130,110 +129,120 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyru() is not supported\n"); /* 
AquiEPP */ \ - throw std::runtime_error("Error: blasZsyru() is not supported."); \ - } \ - else { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZsyrc() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasZsyrc() is not supported."); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasZsyr(); calling kk_syr\n"); /*AquiEPP*/ \ + kk_syr( space, trans, uplo, alpha, X, A); \ + } \ + else { \ + const std::complex alpha_val = static_cast>(alpha); \ + HostBlas>::zher( uplo[0] \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> 
\ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasCsyru() is not supported"); \ - } \ - else { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCsyrc() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: blasCsyrc() is not supported"); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , 
Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasCsyr(); calling kk_syr\n"); /*AquiEPP*/ \ + kk_syr( space, trans, uplo, alpha, X, A); \ + } \ + else { \ + const std::complex alpha_val = static_cast>(alpha); \ + HostBlas>::cher( uplo[0] \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL From 8942c98458e498d0a964dd109e2e57294edaa240 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 00:38:33 -0600 Subject: [PATCH 017/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 113 +++++++++------ blas/unit_test/Test_Blas2_syr.hpp | 131 +++++++++++++----- 2 files changed, 173 insertions(+), 71 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index dd355c9c95..6abf8ea9ca 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -23,6 +23,7 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? 
A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ @@ -57,8 +58,8 @@ namespace Impl { , Kokkos::MemoryTraits \ > AViewType; \ \ - static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char /*trans*/[] \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -67,14 +68,21 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo[0] \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ + if (A_is_ll) { \ + HostBlas::syr( uplo[0] \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + else { \ + /* blasDsyr() + ~A_ll => call kk_syr() */ \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasDsyr() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -107,8 +115,8 @@ namespace Impl { , Kokkos::MemoryTraits \ > AViewType; \ \ - static void syr( const typename AViewType::execution_space & /*space*/ \ - , const char /*trans*/[] \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ , const char uplo[] \ , typename AViewType::const_value_type & alpha \ , const XViewType & X \ @@ -117,14 +125,21 @@ namespace Impl { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - HostBlas::syr( uplo[0] \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ + if (A_is_ll) { \ + HostBlas::syr( uplo[0] \ + , N \ + , alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ); \ + } \ + else { \ + /* 
blasSsyr() + ~A_ll => call kk_syr() */ \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasSsyr() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -169,19 +184,28 @@ namespace Impl { KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ + /* No blasZsyr() => call kk_syr() */ \ KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasZsyr(); calling kk_syr\n"); /*AquiEPP*/ \ - kk_syr( space, trans, uplo, alpha, X, A); \ + kk_syr(space, trans, uplo, alpha, X, A); \ } \ else { \ - const std::complex alpha_val = static_cast>(alpha); \ - HostBlas>::zher( uplo[0] \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ + if (A_is_ll) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Calling blasZher() with A_is_ll = true\n"); /*AquiEPP*/ \ + const std::complex alpha_val = static_cast>(alpha); \ + HostBlas>::zher( uplo[0] \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + /* blasZher() + ~A_ll => call kk_syr() */ \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZher() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -227,19 +251,28 @@ namespace Impl { KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ + /* No blasCsyr() => call kk_syr() */ \ KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasCsyr(); calling kk_syr\n"); /*AquiEPP*/ \ - kk_syr( space, trans, uplo, alpha, X, A); \ + kk_syr(space, trans, uplo, alpha, X, A); \ } \ else { \ - const std::complex alpha_val = static_cast>(alpha); \ - HostBlas>::cher( uplo[0] \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ + if (A_is_ll) { \ + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Calling blasCher() with A_is_ll = true\n"); /*AquiEPP*/ \ + const std::complex alpha_val = static_cast>(alpha); \ + HostBlas>::cher( uplo[0] \ + , N \ + , alpha_val \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + /* blasCher() + ~A_ll => call kk_syr() */ \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCher() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ } \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index a3030b1189..ffeaa9b4b3 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -222,6 +222,10 @@ void SyrTester< ScalarX << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _epsAbs = " << _epsAbs << ", _epsRel = " << _epsRel + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; // ******************************************************************** @@ -441,6 +445,16 @@ void SyrTester< ScalarX Kokkos::deep_copy(h_x, x); Kokkos::deep_copy(h_A, A); } + + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i,j) + << std::endl; + } + } + } + } // Code for complex values @@ -736,6 +750,16 @@ SyrTester< ScalarX , const _ViewTypeExpected & h_vanilla , const _ViewTypeExpected & h_expected ) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i,j) + << std::endl; + } + } + } + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); if (_useAnalyticalResults) { @@ -822,6 +846,7 
@@ SyrTester< ScalarX } } // for j } // for i + { std::ostringstream msg; msg << ", A is " << _M << " by " << _N @@ -936,6 +961,16 @@ SyrTester< ScalarX , const _ViewTypeExpected & h_vanilla , const _ViewTypeExpected & h_expected ) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i,j) + << std::endl; + } + } + } + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); if (_useAnalyticalResults) { @@ -984,6 +1019,7 @@ SyrTester< ScalarX } } // for j } // for i + { std::ostringstream msg; msg << ", A is " << _M << " by " << _N @@ -1053,6 +1089,16 @@ SyrTester< ScalarX , const _HostViewTypeA & h_A , const _ViewTypeExpected & h_expected ) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + << ", h_A(" << i << "," << j << ")=" << h_A(i,j) + << std::endl; + } + } + } + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); int numErrorsRealAbs (0); @@ -1137,6 +1183,7 @@ SyrTester< ScalarX } } // for j } // for i + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll @@ -1247,6 +1294,16 @@ SyrTester< ScalarX , const _HostViewTypeA & h_A , const _ViewTypeExpected & h_expected ) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + << ", h_A(" << i << "," << j << ")=" << h_A(i,j) + << std::endl; + } + } + } + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); int numErrorsAbs (0); @@ -1412,28 +1469,29 @@ int test_syr( const std::string & caseName ) { tester.test(2, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0 , true, false, false); //tester.test(1024, 0 , true, false, 
true); //tester.test(1024, 0 , true, true, false); //tester.test(1024, 0 , true, true, true); } - else { - //tester.test(1024, 0 , false, false, true); - //tester.test(1024, 0 , false, true, false); - //tester.test(1024, 0 , false, true, true); - } + + tester.test(2, 0 , false, false, true); + tester.test(1024, 0 , false, false, true); + tester.test(2, 0 , false, true, false); + tester.test(1024, 0 , false, true, false); + tester.test(2, 0 , false, true, true); + tester.test(1024, 0 , false, true, true); + tester.test(50, 4 ); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0 , true, false, true); //tester.test(2131, 0 , true, true, true); } - else { - //tester.test(2131, 0 , false, false, true); - //tester.test(2131, 0 , false, true, true); - } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); @@ -1452,28 +1510,32 @@ int test_syr( const std::string & caseName ) { tester.test(2, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, false); //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, false); //tester.test(1024, 0, true, true, true); } - else { - //tester.test(1024, 0, false, false, true); - //tester.test(1024, 0, false, true, false); - //tester.test(1024, 0, false, true, true); - } + + tester.test(2, 0, false, false, true); + tester.test(1024, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(1024, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(1024, 0, false, true, true); + tester.test(50, 4); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0, true, false, true); //tester.test(2131, 0, true, true, true); } - else { - //tester.test(2131, 0, false, false, true); - //tester.test(2131, 0, false, true, true); - } + + tester.test(2131, 0, false, false, true); + tester.test(2131, 0, false, true, 
true); } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); @@ -1488,30 +1550,33 @@ int test_syr( const std::string & caseName ) { if (true) { Test::SyrTester tester; tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); tester.test(13, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, false); //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, false); //tester.test(1024, 0, true, true, true); } - else { - //tester.test(1024, 0, false, false, true); - //tester.test(1024, 0, false, true, false); - //tester.test(1024, 0, false, true, true); - } + + tester.test(2, 0, false, false, true); + tester.test(1024, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(1024, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(1024, 0, false, true, true); + tester.test(50, 4); tester.test(1024, 0); tester.test(2131, 0); + if (useAnalyticalResults) { //tester.test(2131, 0, true, false, true); //tester.test(2131, 0, true, true, true); } - else { - //tester.test(2131, 0, false, false, true); - //tester.test(2131, 0, false, true, true); - } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); @@ -1524,15 +1589,19 @@ int test_syr( const std::string & caseName ) { if (true) { Test::SyrTester tester; + tester.test(1, 0); + tester.test(2, 0); tester.test(1024, 0); + if (useAnalyticalResults) { //tester.test(1024, 0, true, false, true); //tester.test(1024, 0, true, true, true); } - else { - //tester.test(1024, 0, false, false, true); - //tester.test(1024, 0, false, true, true); - } + + tester.test(2, 0, false, false, true); + tester.test(1024, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(1024, 0, false, true, true); } if (true) { From 26b41c7150e9dabcee011d8c94278ba10f87bac7 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 00:49:21 -0600 
Subject: [PATCH 018/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index ffeaa9b4b3..5809df154e 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -239,16 +239,6 @@ void SyrTester< ScalarX #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS _kkSyrShouldThrowException = false; - if (_A_is_complex && _useHermitianOption) { - if ((_testIsGpu == false) && - (_A_is_ll == false)) { - _kkSyrShouldThrowException = true; - } - else if ((_testIsGpu == true ) && - (_A_is_ll == false)) { - _kkSyrShouldThrowException = true; - } - } #endif bool test_x (false); From f317609ef8546131a34423d827b34a3891c97bed Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 01:24:15 -0600 Subject: [PATCH 019/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 117 +++++++++++++++--------------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 5809df154e..90ea71627f 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -160,6 +160,7 @@ class SyrTester bool _useHermitianOption; bool _useUpOption; bool _kkSyrShouldThrowException; + bool _kkGerShouldThrowException; }; template @@ -186,6 +187,7 @@ SyrTester< ScalarX , _useHermitianOption (false) , _useUpOption (false) , _kkSyrShouldThrowException (false) + , _kkGerShouldThrowException (false) { } @@ -239,6 +241,11 @@ void SyrTester< ScalarX #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS _kkSyrShouldThrowException = false; + + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + _kkGerShouldThrowException = !_A_is_ll; + } #endif bool test_x (false); @@ -318,6 +325,19 @@ void SyrTester< ScalarX , h_expected.d_view , "non const x" ); + + if ((_useAnalyticalResults == false) && + (_kkGerShouldThrowException == false)) { +#if 0 // 
AquiEEP + this->compareKkSyrAgainstKkGer( alpha + , x.d_view + , A.d_view + , A.h_view + , h_expected.d_view + , "non const x" + ); +#endif + } } // ******************************************************************** @@ -514,7 +534,7 @@ SyrTester< ScalarX ( (_useUpOption == false) && (i >= j) )) { auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); + h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); // AquiEEP h_expected(i,j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); } } @@ -579,7 +599,7 @@ SyrTester< ScalarX if (( (_useUpOption == true ) && (i <= j) ) || ( (_useUpOption == false) && (i >= j) )) { auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j) = 3 * sin(auxIpJ); + h_expected(i,j) = 3 * sin(auxIpJ); // AquiEEP } } } @@ -1461,27 +1481,21 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0 , true, false, false); - //tester.test(1024, 0 , true, false, true); - //tester.test(1024, 0 , true, true, false); - //tester.test(1024, 0 , true, true, true); + //tester.test(1024, 0 , true, false, false); + //tester.test(1024, 0 , true, false, true); + //tester.test(1024, 0 , true, true, false); + //tester.test(1024, 0 , true, true, true); } - tester.test(2, 0 , false, false, true); - tester.test(1024, 0 , false, false, true); - tester.test(2, 0 , false, true, false); - tester.test(1024, 0 , false, true, false); - tester.test(2, 0 , false, true, true); - tester.test(1024, 0 , false, true, true); + tester.test(2, 0 , false, false, true); + tester.test(50, 0 , false, false, true); + tester.test(2, 0 , false, true, false); + tester.test(50, 0 , false, true, false); + tester.test(2, 0 , false, true, true); + tester.test(50, 0 , false, true, true); tester.test(50, 4 ); - tester.test(1024, 0); tester.test(2131, 0); - - 
if (useAnalyticalResults) { - //tester.test(2131, 0 , true, false, true); - //tester.test(2131, 0 , true, true, true); - } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); @@ -1502,30 +1516,21 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, false, false); - //tester.test(1024, 0, true, false, true); - //tester.test(1024, 0, true, true, false); - //tester.test(1024, 0, true, true, true); + //tester.test(1024, 0, true, false, false); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, false); + //tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); - tester.test(1024, 0, false, false, true); - tester.test(2, 0, false, true, false); - tester.test(1024, 0, false, true, false); - tester.test(2, 0, false, true, true); - tester.test(1024, 0, false, true, true); + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); tester.test(50, 4); - tester.test(1024, 0); tester.test(2131, 0); - - if (useAnalyticalResults) { - //tester.test(2131, 0, true, false, true); - //tester.test(2131, 0, true, true, true); - } - - tester.test(2131, 0, false, false, true); - tester.test(2131, 0, false, true, true); } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); @@ -1546,27 +1551,21 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, false, false); - //tester.test(1024, 0, true, false, true); - //tester.test(1024, 0, true, true, false); - //tester.test(1024, 0, true, true, true); + //tester.test(1024, 0, true, false, false); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, false); + 
//tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); - tester.test(1024, 0, false, false, true); - tester.test(2, 0, false, true, false); - tester.test(1024, 0, false, true, false); - tester.test(2, 0, false, true, true); - tester.test(1024, 0, false, true, true); + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); tester.test(50, 4); - tester.test(1024, 0); tester.test(2131, 0); - - if (useAnalyticalResults) { - //tester.test(2131, 0, true, false, true); - //tester.test(2131, 0, true, true, true); - } } KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); @@ -1584,14 +1583,14 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, false, true); - //tester.test(1024, 0, true, true, true); + //tester.test(1024, 0, true, false, true); + //tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); - tester.test(1024, 0, false, false, true); - tester.test(2, 0, false, true, true); - tester.test(1024, 0, false, true, true); + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); } if (true) { From 37b1f353b4493c3eeb181a184632c5d8759a0498 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 08:59:04 -0600 Subject: [PATCH 020/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 160 ++++++++---------- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 57 ++++--- 2 files changed, 109 insertions(+), 108 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 6abf8ea9ca..b75a0a451b 100644 --- 
a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -65,7 +65,6 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ @@ -80,7 +79,6 @@ namespace Impl { } \ else { \ /* blasDsyr() + ~A_ll => call kk_syr() */ \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasDsyr() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ @@ -122,7 +120,6 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ @@ -137,22 +134,84 @@ namespace Impl { } \ else { \ /* blasSsyr() + ~A_ll => call kk_syr() */ \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasSsyr() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename 
AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + else { \ + if (A_is_ll) { \ + const std::complex alpha_val = static_cast>(alpha); \ + HostBlas>::zher( uplo[0] \ + , N \ + , alpha_val /*AquiEEP*/ \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + /* blasZher() + ~A_ll => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ + , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ - , Kokkos::View< Kokkos::complex** \ + , Kokkos::View< Kokkos::complex** \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -160,7 +219,7 @@ namespace Impl { , true \ , ETI_SPEC_AVAIL \ > { \ - typedef Kokkos::complex SCALAR; \ + typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ @@ -179,103 +238,32 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ - /* No blasZsyr() => call kk_syr() */ \ - 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasZsyr(); calling kk_syr\n"); /*AquiEPP*/ \ + /* No blasCsyr() => call kk_syr() */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ else { \ if (A_is_ll) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Calling blasZher() with A_is_ll = true\n"); /*AquiEPP*/ \ - const std::complex alpha_val = static_cast>(alpha); \ - HostBlas>::zher( uplo[0] \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - /* blasZher() + ~A_ll => call kk_syr() */ \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZher() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-blas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr() => call kk_syr() */ \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("No blasCsyr(); calling 
kk_syr\n"); /*AquiEPP*/ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - else { \ - if (A_is_ll) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Calling blasCher() with A_is_ll = true\n"); /*AquiEPP*/ \ const std::complex alpha_val = static_cast>(alpha); \ HostBlas>::cher( uplo[0] \ , N \ - , alpha_val \ + , alpha_val /*AquiEEP*/ \ , reinterpret_cast*>(X.data()) \ , one \ , reinterpret_cast*>(A.data()) \ , LDA \ ); \ - } \ + } \ else { \ /* blasCher() + ~A_ll => call kk_syr() */ \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCher() + ~A_ll => call kk_syr\n"); /*AquiEPP*/ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 8b28e7ad44..80f9fd1e1e 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,13 +22,14 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_lr = std::is_same::value; \ - const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; #define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ @@ -65,7 +66,6 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ @@ -79,7 +79,7 @@ namespace Impl { , A.data() \ , LDA \ ) \ - ); \ + ); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ Kokkos::Profiling::popRegion(); \ } \ @@ -120,7 +120,6 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ @@ -175,22 +174,29 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: cublasZsyru() is not supported."); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ 
+ , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle /*AquiEEP*/ \ + const double alpha_val = alpha.real(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZher( s.handle \ , fillMode \ , N \ - , reinterpret_cast(&alpha) \ + , &alpha_val /*AquiEEP*/ \ , reinterpret_cast(X.data()) \ , one \ , reinterpret_cast(A.data()) \ @@ -238,22 +244,29 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: cublasCsyru() is not supported."); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ } \ else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle /*AquiEEP*/ \ + const float alpha_val = alpha.real(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCher( s.handle \ , fillMode \ , N \ - , reinterpret_cast(&alpha) \ + , &alpha_val /*AquiEEP*/ \ , reinterpret_cast(X.data()) \ , one \ , reinterpret_cast(A.data()) \ From f58df0702705582debf5e82ef6754d365ec1568d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 11:30:48 -0600 Subject: [PATCH 021/231] Backup --- blas/tpls/KokkosBlas_Host_tpl.cpp | 10 ++++++---- blas/unit_test/Test_Blas2_syr.hpp | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) 
diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 6dbe6e84bb..0608250327 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -279,7 +279,7 @@ void F77_BLAS_MANGLE(dsyr, DSYR)( const char* void F77_BLAS_MANGLE(cher, CHER)( const char* , int* - , const std::complex* + , const float* , const std::complex* , int* , std::complex* @@ -287,7 +287,7 @@ void F77_BLAS_MANGLE(cher, CHER)( const char* ); void F77_BLAS_MANGLE(zher, ZHER)( const char* , int* - , const std::complex* + , const double* , const std::complex* , int* , std::complex* @@ -937,9 +937,10 @@ void HostBlas >::cher( const char uplo , std::complex* a , int lda ) { + const float alphaVal = alpha.real(); F77_FUNC_CHER( &uplo , &n - , &alpha + , &alphaVal , (const std::complex*)x , &incx , (std::complex*)a @@ -1125,9 +1126,10 @@ void HostBlas >::zher( const char uplo , std::complex* a , int lda ) { + const double alphaVal = alpha.real(); F77_FUNC_ZHER( &uplo , &n - , &alpha + , &alphaVal , (const std::complex*)x , &incx , (std::complex*)a diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 90ea71627f..ea5bd6f638 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -175,7 +175,7 @@ SyrTester< ScalarX , _A_is_ll ( std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) + , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) // AquiEEP #else , _vanillaUsesDifferentOrderOfOps( false ) #endif @@ -1416,6 +1416,7 @@ void SyrTester< ScalarX , const std::string & situation ) { + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): ViewTypeA = %s, 
_kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException ); std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? "U" : "L"; From 3b2ee150599c0f10ae72b874abf2385aee626c56 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 20:11:20 -0600 Subject: [PATCH 022/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index ea5bd6f638..8fe4e3830a 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -450,6 +450,7 @@ void SyrTester< ScalarX ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); Kokkos::fill_random(A, rand_pool, randStart, randEnd); + // AquiEEP: make A symmetric or hermitian } Kokkos::deep_copy(h_x, x); From 01ee52e91a17b64044657a60dc1b9c18033fb956 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 21:39:38 -0600 Subject: [PATCH 023/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 49 ++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 8fe4e3830a..243da2ccdc 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -59,6 +59,10 @@ class SyrTester , bool & expectedResultIsKnown ); + void makeMatrixSymmetric(_HostViewTypeA & h_A); + + void makeMatrixHermitian(_HostViewTypeA & h_A); + template typename std::enable_if< std::is_same>::value || std::is_same>::value , void @@ -450,11 +454,18 @@ void SyrTester< ScalarX ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); Kokkos::fill_random(A, rand_pool, randStart, randEnd); - // AquiEEP: make A symmetric or hermitian } Kokkos::deep_copy(h_x, x); Kokkos::deep_copy(h_A, A); + + if (_useHermitianOption && _A_is_complex) { + this->makeMatrixHermitian(h_A); + } + else { + 
this->makeMatrixSymmetric(h_A); + } + Kokkos::deep_copy(A, h_A); } if (_N <= 2) { @@ -468,6 +479,42 @@ void SyrTester< ScalarX } +template +void +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::makeMatrixSymmetric(_HostViewTypeA & h_A) +{ + for (int i(0); i < _N; ++i) { + for (int j(i+1); j < _N; ++j) { + h_A(i,j) = h_A(j,i); + } + } +} + +template +void +SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::makeMatrixHermitian(_HostViewTypeA & h_A) +{ + for (int i(0); i < _N; ++i) { + for (int j(i+1); j < _N; ++j) { + h_A(i,j) = _KAT_A::conj( h_A(j,i) ); + } + } + + for (int i(0); i < _N; ++i) { + h_A(i,i) = 0.5 * ( h_A(i,i) + _KAT_A::conj( h_A(i,i) ) ); + } +} + // Code for complex values template template From 5c8d270f789ab6689c244ce0644b09496f7be6df Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 16 May 2023 22:58:08 -0600 Subject: [PATCH 024/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 243da2ccdc..6b20da782e 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -682,6 +682,9 @@ SyrTester< ScalarX } } } + for (int i = 0; i < _N; ++i) { + h_vanilla(i,i).imag() = 0.; + } } else { for (int i = 0; i < _M; ++i) { @@ -710,6 +713,9 @@ SyrTester< ScalarX } } } + for (int i = 0; i < _N; ++i) { + h_vanilla(i,i).imag() = 0.; + } } else { for (int i = 0; i < _M; ++i) { From ebac83552205ca0d405c8e05c5fb8bd9538d2f11 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 00:14:58 -0600 Subject: [PATCH 025/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 523 ++++++++++-------- 1 file changed, 283 insertions(+), 240 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 80f9fd1e1e..b2de8c5a26 100644 --- 
a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -23,6 +23,7 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ @@ -31,252 +32,294 @@ namespace Impl { cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char /*trans*/[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ - , fillMode \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ + , fillMode \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasDsyr() + ~A_ll => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - 
typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char /*trans*/[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ - , fillMode \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + 
KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ + , fillMode \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasSsyr() + ~A_ll => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle \ - , fillMode \ - 
, N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - const double alpha_val = alpha.real(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZher( s.handle \ - , fillMode \ - , N \ - , &alpha_val /*AquiEEP*/ \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , 
reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasZsyr() + ~A_ll => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZher( s.handle \ + , fillMode \ + , N \ + , &alpha_val /*AquiEEP*/ \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - 
bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle \ - , fillMode \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - const float alpha_val = alpha.real(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCher( s.handle \ - , fillMode \ - , N \ - , &alpha_val /*AquiEEP*/ \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + 
KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle \ + , fillMode \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasCsyr() + ~A_ll => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCher( s.handle \ + , fillMode \ + , N \ + , &alpha_val /*AquiEEP*/ \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ + } \ + else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) From 8efa6c6237bc82948e564056bbe1cbe6b2d7f335 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 10:48:45 -0600 Subject: [PATCH 026/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 242 +++++++++--------- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 4 +- blas/tpls/KokkosBlas_Host_tpl.cpp | 40 +-- blas/tpls/KokkosBlas_Host_tpl.hpp | 6 +- 4 files changed, 146 insertions(+), 146 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index b75a0a451b..69e512c94a 100644 --- 
a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -140,130 +140,128 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr() => call kk_syr() */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - else { \ - if (A_is_ll) { \ - const std::complex alpha_val = static_cast>(alpha); \ - HostBlas>::zher( uplo[0] \ - , N \ - , alpha_val /*AquiEEP*/ \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - /* blasZher() + ~A_ll => call kk_syr() */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< 
Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + else { \ + if (A_is_ll) { \ + HostBlas>::zher( uplo[0] \ + , N \ + , alpha.real() \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + /* blasZher() + [~A_ll or ~real alpha] => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename 
AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr() => call kk_syr() */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - else { \ - if (A_is_ll) { \ - const std::complex alpha_val = static_cast>(alpha); \ - HostBlas>::cher( uplo[0] \ - , N \ - , alpha_val /*AquiEEP*/ \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - /* blasCher() + ~A_ll => call kk_syr() */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No 
blasCsyr() => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::cher( uplo[0] \ + , N \ + , alpha.real() \ + , reinterpret_cast*>(X.data()) \ + , one \ + , reinterpret_cast*>(A.data()) \ + , LDA \ + ); \ + } \ + else { \ + /* blasCher() + [~A_ll or ~real alpha] => call kk_syr() */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index b2de8c5a26..84acc75a5c 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -219,7 +219,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZher( s.handle \ , fillMode \ , N \ - , &alpha_val /*AquiEEP*/ \ + , &alpha_val \ , reinterpret_cast(X.data()) \ , one \ , reinterpret_cast(A.data()) \ @@ -304,7 +304,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCher( s.handle \ , fillMode \ , N \ - , &alpha_val /*AquiEEP*/ \ + , &alpha_val \ , reinterpret_cast(X.data()) \ , one \ , reinterpret_cast(A.data()) \ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 0608250327..d8183d7ab9 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -929,18 +929,18 @@ void HostBlas >::gerc( &lda); } template <> -void HostBlas >::cher( const char uplo - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , std::complex* a - , int lda - ) { - const float alphaVal = alpha.real(); +template <> +void HostBlas >::cher( const char uplo + , int n + , const float alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { F77_FUNC_CHER( &uplo , &n - , &alphaVal + , &alpha , (const std::complex*)x , &incx , (std::complex*)a @@ -1118,18 +1118,18 @@ void HostBlas >::gerc( 
(std::complex*)a, &lda); } template <> -void HostBlas >::zher( const char uplo - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , std::complex* a - , int lda - ) { - const double alphaVal = alpha.real(); +template <> +void HostBlas >::zher( const char uplo + , int n + , const double alpha + , const std::complex* x + , int incx + , std::complex* a + , int lda + ) { F77_FUNC_ZHER( &uplo , &n - , &alphaVal + , &alpha , (const std::complex*)x , &incx , (std::complex*)a diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index d06406d764..8b6391c92d 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -82,18 +82,20 @@ struct HostBlas { , int lda ); + template< typename tAlpha > static void cher( const char uplo , int n - , const T alpha + , const tAlpha alpha , const T* x , int incx , T* a , int lda ); + template< typename tAlpha > static void zher( const char uplo , int n - , const T alpha + , const tAlpha alpha , const T* x , int incx , T* a From 2854aa0b44e828f4b7396fc5dcb8761aba16c53a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 11:55:11 -0600 Subject: [PATCH 027/231] Backup --- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 12 +- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 12 +- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 441 ++++++++++-------- 3 files changed, 261 insertions(+), 204 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 69e512c94a..ce2e14f61b 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -78,7 +78,7 @@ namespace Impl { ); \ } \ else { \ - /* blasDsyr() + ~A_ll => call kk_syr() */ \ + /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ @@ -133,7 +133,7 @@ namespace Impl { ); \ } 
\ else { \ - /* blasSsyr() + ~A_ll => call kk_syr() */ \ + /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ @@ -179,7 +179,7 @@ namespace Impl { KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ - /* No blasZsyr() => call kk_syr() */ \ + /* No blasZsyr() => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ else { \ @@ -194,7 +194,7 @@ namespace Impl { ); \ } \ else { \ - /* blasZher() + [~A_ll or ~real alpha] => call kk_syr() */ \ + /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ @@ -241,7 +241,7 @@ namespace Impl { KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ - /* No blasCsyr() => call kk_syr() */ \ + /* No blasCsyr() => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ else { \ @@ -256,7 +256,7 @@ namespace Impl { ); \ } \ else { \ - /* blasCher() + [~A_ll or ~real alpha] => call kk_syr() */ \ + /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 84acc75a5c..bceedf000f 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -85,7 +85,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasDsyr() + ~A_ll => call kk_syr() */ \ + /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ @@ -145,7 +145,7 @@ namespace Impl { 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasSsyr() + ~A_ll => call kk_syr() */ \ + /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ @@ -207,7 +207,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasZsyr() + ~A_ll => call kk_syr() */ \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ @@ -229,7 +229,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kk_syr() */ \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ @@ -292,7 +292,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasCsyr() + ~A_ll => call kk_syr() */ \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ @@ -314,7 +314,7 @@ namespace Impl { KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ } \ else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kk_syr() */ \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ kk_syr(space, trans, uplo, alpha, X, A); \ } \ } \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 38870efda7..d7173a17c1 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -23,131 +23,229 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? 
A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? rocblas_fill_lower : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dsyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const 
double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + } \ + else { \ + /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , 
Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-ssyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ + , uplo \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + } \ + else { \ + /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< EXEC_SPACE \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void syr( const typename AViewType::execution_space & space \ + , const char trans[] \ + , const char uplo[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const AViewType & A \ + ) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyr( s.handle \ + , uplo \ 
+ , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zher( s.handle \ + , uplo \ + , N \ + , &alpha_val \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ + , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ - , Kokkos::View< Kokkos::complex** \ + , Kokkos::View< Kokkos::complex** \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -155,7 +253,7 @@ namespace Impl { , true \ , ETI_SPEC_AVAIL \ > { \ - typedef Kokkos::complex SCALAR; \ + typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ @@ -174,97 +272,56 @@ namespace Impl { , const XViewType & X \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zsyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: rocblasZsyru() is not supported."); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyr( s.handle \ + , uplo \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ } \ else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cher( s.handle \ + , uplo \ + , N \ + , &alpha_val \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + else { \ + /* rocblas_cher() + 
[~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ + kk_syr(space, trans, uplo, alpha, X, A); \ + } \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-csyr-rocblas\n" ); \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (justTranspose) { \ - kk_syr( space, trans, uplo, alpha, X, A); \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCsyru() is not supported\n"); /* AquiEPP */ \ - throw std::runtime_error("Error: rocblasCsyru() is not supported."); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyrc( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , 
LDA \ - ) \ - ); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) From c8198d9e3d0c22f783c49e2fa70fa82fe12aecf7 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 21:18:14 -0600 Subject: [PATCH 028/231] Backup --- blas/src/KokkosBlas2_syr.hpp | 4 +- blas/unit_test/Test_Blas2_syr.hpp | 407 +++++++++++++++++------------- 2 files changed, 229 insertions(+), 182 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 9878ede263..0b21b570df 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -21,7 +21,7 @@ namespace KokkosBlas { -/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. // AquiEEP /// /// \tparam ExecutionSpace The type of execution space /// \tparam XViewType Input vector, as a 1-D Kokkos::View @@ -127,7 +127,7 @@ void syr( const ExecutionSpace & space ); } -/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. 
// AquiEEP /// /// \tparam XViewType Input vector, as a 1-D Kokkos::View /// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 6b20da782e..c4ae585de9 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -59,10 +59,6 @@ class SyrTester , bool & expectedResultIsKnown ); - void makeMatrixSymmetric(_HostViewTypeA & h_A); - - void makeMatrixHermitian(_HostViewTypeA & h_A); - template typename std::enable_if< std::is_same>::value || std::is_same>::value , void @@ -107,37 +103,37 @@ class SyrTester typename std::enable_if< std::is_same>::value || std::is_same>::value , void >::type - compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + compareVanillaAgainstExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); template typename std::enable_if< !std::is_same>::value && !std::is_same>::value , void >::type - compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + compareVanillaAgainstExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); template typename std::enable_if< std::is_same>::value || std::is_same>::value , void >::type - compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ); + compareKkSyrAgainstReference( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_reference + ); template typename std::enable_if< !std::is_same>::value && !std::is_same>::value , void >::type - compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ); + compareKkSyrAgainstReference( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected 
& h_reference + ); template T shrinkAngleToZeroTwoPiRange(const T input); @@ -151,6 +147,14 @@ class SyrTester , const std::string & situation ); + template + void callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha + , TX & x + , _ViewTypeA & h_A_orig + , const _ViewTypeExpected & h_A_syr + , const std::string & situation + ); + const bool _A_is_complex; const bool _A_is_lr; const bool _A_is_ll; @@ -179,7 +183,7 @@ SyrTester< ScalarX , _A_is_ll ( std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) // AquiEEP + , _vanillaUsesDifferentOrderOfOps( _A_is_lr ) // && _testIsGpu ) // AquiEEP #else , _vanillaUsesDifferentOrderOfOps( false ) #endif @@ -303,10 +307,10 @@ void SyrTester< ScalarX // ****************************************************************** // Compare h_vanilla against h_expected // ****************************************************************** - this->compareVanillaExpected( alpha - , h_vanilla.d_view - , h_expected.d_view - ); + this->compareVanillaAgainstExpected( alpha + , h_vanilla.d_view + , h_expected.d_view + ); } else { // ****************************************************************** @@ -318,7 +322,7 @@ void SyrTester< ScalarX // ******************************************************************** // Step 5 of 7: test with 'non const x' // ******************************************************************** - view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); // AquiEEP (see ger as well) + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); Kokkos::deep_copy(org_A.d_base, A.d_base); if (test_x) { @@ -330,17 +334,14 @@ void SyrTester< ScalarX , "non const x" ); - if ((_useAnalyticalResults == false) && + if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { -#if 
0 // AquiEEP - this->compareKkSyrAgainstKkGer( alpha - , x.d_view - , A.d_view - , A.h_view - , h_expected.d_view - , "non const x" - ); -#endif + this->callKkGerAndCompareKkSyrAgainstIt( alpha + , x.d_view + , org_A.d_view + , A.h_view + , "non const x" + ); } } @@ -460,10 +461,24 @@ void SyrTester< ScalarX Kokkos::deep_copy(h_A, A); if (_useHermitianOption && _A_is_complex) { - this->makeMatrixHermitian(h_A); + // Make h_A Hermitian + for (int i(0); i < _N; ++i) { + for (int j(i+1); j < _N; ++j) { + h_A(i,j) = _KAT_A::conj( h_A(j,i) ); + } + } + + for (int i(0); i < _N; ++i) { + h_A(i,i) = 0.5 * ( h_A(i,i) + _KAT_A::conj( h_A(i,i) ) ); + } } else { - this->makeMatrixSymmetric(h_A); + // Make h_A symmetric + for (int i(0); i < _N; ++i) { + for (int j(i+1); j < _N; ++j) { + h_A(i,j) = h_A(j,i); + } + } } Kokkos::deep_copy(A, h_A); } @@ -479,42 +494,6 @@ void SyrTester< ScalarX } -template -void -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::makeMatrixSymmetric(_HostViewTypeA & h_A) -{ - for (int i(0); i < _N; ++i) { - for (int j(i+1); j < _N; ++j) { - h_A(i,j) = h_A(j,i); - } - } -} - -template -void -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::makeMatrixHermitian(_HostViewTypeA & h_A) -{ - for (int i(0); i < _N; ++i) { - for (int j(i+1); j < _N; ++j) { - h_A(i,j) = _KAT_A::conj( h_A(j,i) ); - } - } - - for (int i(0); i < _N; ++i) { - h_A(i,i) = 0.5 * ( h_A(i,i) + _KAT_A::conj( h_A(i,i) ) ); - } -} - // Code for complex values template template @@ -810,10 +789,10 @@ SyrTester< ScalarX , ScalarA , tLayoutA , Device - >::compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { + >::compareVanillaAgainstExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { @@ -1021,10 +1000,10 @@ SyrTester< ScalarX , ScalarA , 
tLayoutA , Device - >::compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { + >::compareVanillaAgainstExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { @@ -1149,14 +1128,14 @@ SyrTester< ScalarX , ScalarA , tLayoutA , Device - >::compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ) { + >::compareKkSyrAgainstReference( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_reference + ) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i,j) << ", h_A(" << i << "," << j << ")=" << h_A(i,j) << std::endl; } @@ -1180,9 +1159,9 @@ SyrTester< ScalarX int jForMaxErrorImagRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()); + diff = _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()); errorHappened = false; - if (h_expected(i,j).real() == 0.) { + if (h_reference(i,j).real() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; @@ -1190,14 +1169,14 @@ SyrTester< ScalarX } } else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + _AuxType aux = diff / _KAT_A::abs(h_reference(i,j).real()); if (maxErrorRealRel < aux) { maxErrorRealRel = aux; iForMaxErrorRealRel = i; jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j).real()); if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; @@ -1206,16 +1185,16 @@ SyrTester< ScalarX if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_A(i,j).real() = " << h_A(i,j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff - << ", diffThreshold = " << diffThreshold + << ": h_reference(i,j).real() = " << h_reference(i,j).real() + << ", h_A(i,j).real() = " << h_A(i,j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; } - diff = _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()); + diff = _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()); errorHappened = false; - if (h_expected(i,j).imag() == 0.) { + if (h_reference(i,j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; @@ -1223,14 +1202,14 @@ SyrTester< ScalarX } } else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + _AuxType aux = diff / _KAT_A::abs(h_reference(i,j).imag()); if (maxErrorImagRel < aux) { maxErrorImagRel = aux; iForMaxErrorImagRel = i; jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j).imag()); if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; @@ -1239,48 +1218,38 @@ SyrTester< ScalarX if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_A(i,j).imag() = " << h_A(i,j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff - << ", diffThreshold = " << diffThreshold + << ": h_reference(i,j).imag() = " << h_reference(i,j).imag() + << ", h_A(i,j).imag() = " << h_A(i,j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; } } // for j } // for i std::cout << "A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { - std::cout << "Information" - << ": A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_expected(11, 2119) = (" << h_expected(11,2119).real() << ", " << h_expected(11,2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" - << std::endl; std::cout << "Information" << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr @@ -1288,28 +1257,38 @@ SyrTester< ScalarX << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption - << ", h_expected(710, 1065) = (" << h_expected(710,1065).real() << ", " << h_expected(710,1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" + << ", h_reference(11, 2119) = (" << h_reference(11,2119).real() << ", " << h_reference(11,2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" + << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710,1065).real() << ", " << h_reference(710,1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" << std::endl; } { std::ostringstream msg; msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = 
" << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { @@ -1320,20 +1299,20 @@ SyrTester< ScalarX { std::ostringstream msg; msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { @@ -1354,14 +1333,14 @@ SyrTester< ScalarX , ScalarA , tLayoutA , Device - >::compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ) { + >::compareKkSyrAgainstReference( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_reference + ) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i,j) << ", h_A(" << i << "," << j << ")=" << h_A(i,j) << std::endl; } @@ -1380,9 +1359,9 @@ SyrTester< ScalarX int jForMaxErrorRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j) - h_A(i,j)); + diff = _KAT_A::abs(h_reference(i,j) - h_A(i,j)); errorHappened = false; - if (h_expected(i,j) == 0.) { + if (h_reference(i,j) == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; @@ -1390,14 +1369,14 @@ SyrTester< ScalarX } } else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + _AuxType aux = diff / _KAT_A::abs(h_reference(i,j)); if (maxErrorRel < aux) { maxErrorRel = aux; iForMaxErrorRel = i; jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; @@ -1406,10 +1385,10 @@ SyrTester< ScalarX if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_A(i,j) = " << h_A(i,j) - << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff - << ", diffThreshold = " << diffThreshold + << ": h_reference(i,j) = " << h_reference(i,j) + << ", h_A(i,j) = " << h_A(i,j) + << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; } } // for j @@ -1425,7 +1404,7 @@ SyrTester< ScalarX << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_reference(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; @@ -1443,7 +1422,7 @@ SyrTester< ScalarX << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_reference(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_reference(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; @@ -1497,12 +1476,80 @@ void SyrTester< ScalarX if (( gotStdException == false ) && ( gotUnknownException == false )) { Kokkos::deep_copy(h_A, A); + this->compareKkSyrAgainstReference( alpha + , h_A + , h_expected + ); + } +} - this->compareKokkosExpected( alpha - , h_A - , h_expected - ); +template +template +void SyrTester< ScalarX + , tLayoutX + , ScalarA + , tLayoutA + , Device + >::callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha + , TX & x + , _ViewTypeA & h_A_orig + , const _ViewTypeExpected & h_A_syr + , const std::string & situation + ) +{ + view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); + Kokkos::deep_copy(A_ger.d_base, h_A_orig); + + // ******************************************************************** + // Call ger() + // ******************************************************************** + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException ); + std::string mode = _useHermitianOption ? "H" : "T"; + bool gotStdException (false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); + } + catch( const std::exception& e ) { + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught exception, e.what() = " << e.what() << std::endl; + gotStdException = true; + } + catch( ... 
) { + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught unknown exception" << std::endl; + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened for ger() call"; + + EXPECT_EQ(gotStdException, false) << "Failed test, '" << situation << "': kk ger() should not have thrown a std::exception"; + + // ******************************************************************** + // Prepare h_ger_reference to be compared against h_A_syr + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); + Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + + std::string uplo = _useUpOption ? "U" : "L"; // Aqui + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (( (_useUpOption == true ) && (i <= j) ) || + ( (_useUpOption == false) && (i >= j) )) { + // Keep h_ger_reference as already computed + } + else { + h_ger_reference.d_view(i,j) = h_A_orig(i,j); + } + } } + + // ******************************************************************** + // Compare + // ******************************************************************** + this->compareKkSyrAgainstReference( alpha + , h_A_syr + , h_ger_reference.d_view + ); } } // namespace Test From ddb7623def780b4e9031883631f957068601e8ea Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 22:12:46 -0600 Subject: [PATCH 029/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index c4ae585de9..71a37aad7b 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -461,7 +461,9 @@ void SyrTester< ScalarX Kokkos::deep_copy(h_A, A); if (_useHermitianOption && _A_is_complex) { + // 
**************************************************************** // Make h_A Hermitian + // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i+1); j < _N; ++j) { h_A(i,j) = _KAT_A::conj( h_A(j,i) ); @@ -473,7 +475,9 @@ void SyrTester< ScalarX } } else { + // **************************************************************** // Make h_A symmetric + // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i+1); j < _N; ++j) { h_A(i,j) = h_A(j,i); @@ -1542,6 +1546,11 @@ void SyrTester< ScalarX } } } + if (_useHermitianOption && _A_is_complex) { + for (int i(0); i < _N; ++i) { + h_ger_reference.d_view(i,i) = 0.5 * ( h_ger_reference.d_view(i,i) + _KAT_A::conj( h_ger_reference.d_view(i,i) ) ); + } + } // ******************************************************************** // Compare From d373ee4b2451ba08ad12d837d7d9166dacffb23e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 22:48:18 -0600 Subject: [PATCH 030/231] Backup --- blas/src/KokkosBlas2_syr.hpp | 38 +++++++++++++++++++++++++++++-- blas/unit_test/Test_Blas2_syr.hpp | 2 +- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 0b21b570df..ddae2471de 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -21,7 +21,21 @@ namespace KokkosBlas { -/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. // AquiEEP +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. +/// +/// Important note 1: this routine has the purpose of updating a +/// symmetric (or Hermitian) matrix A in such a way that it continues +/// to be symmetric (or Hermitian). Therefore, in Hermitian cases, +/// the parameter alpha must be real. 
+/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian, and +/// even if a complex alpha is supplied in Hermitian cases. Moreover, +/// this routine will always compute either the lower portion or the +/// upper portion (per user's request) of the final matrix A. So, in +/// order to have meaningful results, the user must make sure to +/// follow the conditions specied in the "important note 1" above. /// /// \tparam ExecutionSpace The type of execution space /// \tparam XViewType Input vector, as a 1-D Kokkos::View @@ -32,6 +46,9 @@ namespace KokkosBlas { /// run on. /// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. /// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. /// \param alpha [in] Input coefficient of x * x^{T,H} /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View @@ -127,13 +144,30 @@ void syr( const ExecutionSpace & space ); } -/// \brief Rank-1 update of a general matrix: A = A + alpha * x * x^{T,H}. // AquiEEP +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. +/// +/// Important note 1: this routine has the purpose of updating a +/// symmetric (or Hermitian) matrix A in such a way that it continues +/// to be symmetric (or Hermitian). Therefore, in Hermitian cases, +/// the parameter alpha must be real. +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian, and +/// even if a complex alpha is supplied in Hermitian cases. Moreover, +/// this routine will always compute either the lower portion or the +/// upper portion (per user's request) of the final matrix A. 
So, in +/// order to have meaningful results, the user must make sure to +/// follow the conditions specied in the "important note 1" above. /// /// \tparam XViewType Input vector, as a 1-D Kokkos::View /// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View /// /// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. /// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. /// \param alpha [in] Input coefficient of x * x^{T,H} /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 71a37aad7b..177f615cab 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -1534,7 +1534,7 @@ void SyrTester< ScalarX view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); - std::string uplo = _useUpOption ? "U" : "L"; // Aqui + std::string uplo = _useUpOption ? 
"U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { if (( (_useUpOption == true ) && (i <= j) ) || From f6e904b0d0f7cab7091bb5a5d3754ed333a8601e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 17 May 2023 23:28:24 -0600 Subject: [PATCH 031/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 177f615cab..a15bbafd16 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -150,7 +150,7 @@ class SyrTester template void callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha , TX & x - , _ViewTypeA & h_A_orig + , _HostViewTypeA & h_A_orig , const _ViewTypeExpected & h_A_syr , const std::string & situation ); @@ -183,7 +183,7 @@ SyrTester< ScalarX , _A_is_ll ( std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - , _vanillaUsesDifferentOrderOfOps( _A_is_lr ) // && _testIsGpu ) // AquiEEP + , _vanillaUsesDifferentOrderOfOps( _A_is_lr ) #else , _vanillaUsesDifferentOrderOfOps( false ) #endif @@ -336,9 +336,10 @@ void SyrTester< ScalarX if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { + Kokkos::deep_copy(org_A.h_view, org_A.d_base); this->callKkGerAndCompareKkSyrAgainstIt( alpha , x.d_view - , org_A.d_view + , org_A.h_view , A.h_view , "non const x" ); @@ -1496,7 +1497,7 @@ void SyrTester< ScalarX , Device >::callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha , TX & x - , _ViewTypeA & h_A_orig + , _HostViewTypeA & h_A_orig , const _ViewTypeExpected & h_A_syr , const std::string & situation ) From 841226eb73025c66f2055377643e598ac96aa79f Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 18 May 2023 02:17:55 -0600 Subject: [PATCH 032/231] Backup --- 
blas/src/KokkosBlas1_swap.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index ea864cc3b4..9916e8b19c 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,7 +41,7 @@ namespace KokkosBlas { /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { // AquiEEP +void swap(execution_space const& space, XVector const& x, YVector const& y) { // AquiEEP // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); @@ -93,7 +93,7 @@ void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { -#if 0 // AquiEEP +#if 1 // AquiEEP Impl::Swap::swap(space, X, Y); #endif From 4321f9fb8f977c3ae9bedadbe8585a1131618045 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 18 May 2023 15:25:36 -0600 Subject: [PATCH 033/231] Backup --- blas/impl/KokkosBlas2_syr_impl.hpp | 8 ++-- blas/src/KokkosBlas2_syr.hpp | 32 ++++++++++---- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 43 ++++++++++++++++--- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 1 - .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 1 - 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 9a6d0428a9..3aad2ba768 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -54,7 +54,7 @@ struct SingleLevelSYR { } else { const IndexType N ( A_.extent(1) ); - const XComponentType x_fixed( x_(i) ); + const XComponentType x_fixed( x_(i) ); // Aqui: performance improvement if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { @@ -160,7 +160,7 @@ struct 
TwoLevelSYR { const IndexType M ( A_.extent(0) ); const IndexType j ( team.league_rank() ); if (justTranspose_) { - const XComponentType x_fixed( x_(j) ); + const XComponentType x_fixed( x_(j) ); // Aqui: performance improvement Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { @@ -169,7 +169,7 @@ struct TwoLevelSYR { }); } else { - const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); + const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); // Aqui: performance improvement Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { if (( (justUp_ == true ) && (i <= j) ) || ( (justUp_ == false) && (i >= j) )) { @@ -190,7 +190,7 @@ struct TwoLevelSYR { else { const IndexType N ( A_.extent(1) ); const IndexType i ( team.league_rank() ); - const XComponentType x_fixed( x_(i) ); + const XComponentType x_fixed( x_(i) ); // Aqui: performance improvement if (justTranspose_) { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { if (( (justUp_ == true ) && (i <= j) ) || diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index ddae2471de..2477e3fb5d 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -24,10 +24,11 @@ namespace KokkosBlas { /// \brief Rank-1 update (just lower portion or just upper portion) of a /// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. /// -/// Important note 1: this routine has the purpose of updating a -/// symmetric (or Hermitian) matrix A in such a way that it continues -/// to be symmetric (or Hermitian). Therefore, in Hermitian cases, -/// the parameter alpha must be real. +/// Important note 1: this routine encapsulates the syr() and her() +/// routines specified in BLAS documentations. 
It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). Therefore, in +/// Hermitian cases, the parameter alpha must be real. /// /// Important note 2: however, this routine will honor all parameters /// passed to it, even if A is not symmetric or not Hermitian, and @@ -37,6 +38,13 @@ namespace KokkosBlas { /// order to have meaningful results, the user must make sure to /// follow the conditions specied in the "important note 1" above. /// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honorning all +/// parameters passed, as stated in the "important note 2" above. +/// /// \tparam ExecutionSpace The type of execution space /// \tparam XViewType Input vector, as a 1-D Kokkos::View /// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View @@ -147,10 +155,11 @@ void syr( const ExecutionSpace & space /// \brief Rank-1 update (just lower portion or just upper portion) of a /// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. /// -/// Important note 1: this routine has the purpose of updating a -/// symmetric (or Hermitian) matrix A in such a way that it continues -/// to be symmetric (or Hermitian). Therefore, in Hermitian cases, -/// the parameter alpha must be real. +/// Important note 1: this routine encapsulates the syr() and her() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). Therefore, in +/// Hermitian cases, the parameter alpha must be real. 
/// /// Important note 2: however, this routine will honor all parameters /// passed to it, even if A is not symmetric or not Hermitian, and @@ -160,6 +169,13 @@ void syr( const ExecutionSpace & space /// order to have meaningful results, the user must make sure to /// follow the conditions specied in the "important note 1" above. /// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honorning all +/// parameters passed, as stated in the "important note 2" above. +/// /// \tparam XViewType Input vector, as a 1-D Kokkos::View /// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View /// diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index ce2e14f61b..92422c34ae 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -25,7 +25,6 @@ namespace Impl { #define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT) \ bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ - const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); @@ -79,7 +78,12 @@ namespace Impl { } \ else { \ /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -134,7 +138,12 @@ namespace Impl { } \ else { \ /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -180,7 +189,12 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ /* No blasZsyr() => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ else { \ if (A_is_ll) { \ @@ -195,7 +209,12 @@ namespace Impl { } \ else { \ /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -242,7 +261,12 @@ namespace Impl { bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ /* No blasCsyr() => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ @@ -257,7 +281,12 @@ namespace Impl { } \ else { \ /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ - 
kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index bceedf000f..0d536cb592 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -25,7 +25,6 @@ namespace Impl { #define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ - const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index d7173a17c1..f09a3d2fd9 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -25,7 +25,6 @@ namespace Impl { #define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ - const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); \ From c7d9341a667529f83f02e035ad4b679e85b6d485 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 18 May 2023 17:40:58 -0600 Subject: [PATCH 034/231] Backup --- blas/impl/KokkosBlas2_syr_impl.hpp | 6 +-- blas/impl/KokkosBlas2_syr_spec.hpp | 47 ++++++------------- blas/src/KokkosBlas1_swap.hpp | 4 +- blas/src/KokkosBlas2_syr.hpp | 8 ++-- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 42 ++++++++++++++--- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 42 ++++++++++++++--- 6 files changed, 96 insertions(+), 53 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 3aad2ba768..7bd66c55bf 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -48,7 +48,7 @@ struct SingleLevelSYR { // Nothing to do } - KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { // AquiEEP + KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } @@ -150,7 +150,7 @@ struct TwoLevelSYR { public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag // AquiEEP + KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag , const member_type & team ) const { if (alpha_ == Kokkos::ArithTraits::zero()) { @@ -181,7 +181,7 @@ struct TwoLevelSYR { } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag // AquiEEP + KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag , const member_type & team ) const { if (alpha_ == Kokkos::ArithTraits::zero()) { diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 91906dcd5a..786122868b 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -20,9 +20,9 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" -//#if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // AquiEEP +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include -//#endif +#endif namespace KokkosBlas { namespace Impl { @@ -68,20 +68,23 @@ namespace Impl { // syr // +// Implementation of KokkosBlas::syr. template < class ExecutionSpace , class XViewType , class AViewType + , bool tpl_spec_avail = syr_tpl_spec_avail::value + , bool eti_spec_avail = syr_eti_spec_avail::value > -static void kk_syr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) -{ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Syr::syr()\n" ); - +struct SYR { + static void syr( const ExecutionSpace & space + , const char trans[] + , const char uplo[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const AViewType & A + ) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr[ETI]" : "KokkosBlas::syr[noETI]"); typedef typename AViewType::size_type size_type; @@ -110,26 +113,6 @@ static void kk_syr( const ExecutionSpace & space } Kokkos::Profiling::popRegion(); -} - -// Implementation of KokkosBlas::syr. 
-template < class ExecutionSpace - , class XViewType - , class AViewType - , bool tpl_spec_avail = syr_tpl_spec_avail::value - , bool eti_spec_avail = syr_eti_spec_avail::value - > -struct SYR { - static void syr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) -#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - { - kk_syr(space, trans, uplo, alpha, x, A); } #else ; diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index 9916e8b19c..ea864cc3b4 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,7 +41,7 @@ namespace KokkosBlas { /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& space, XVector const& x, YVector const& y) { // AquiEEP +void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { // AquiEEP // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); @@ -93,7 +93,7 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { // Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { -#if 1 // AquiEEP +#if 0 // AquiEEP Impl::Swap::swap(space, X, Y); #endif diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 2477e3fb5d..173b1843bb 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -35,14 +35,14 @@ namespace KokkosBlas { /// even if a complex alpha is supplied in Hermitian cases. Moreover, /// this routine will always compute either the lower portion or the /// upper portion (per user's request) of the final matrix A. 
So, in -/// order to have meaningful results, the user must make sure to +/// order to obtain meaningful results, the user must make sure to /// follow the conditions specied in the "important note 1" above. /// /// Important note 3: if TPL is enabled, this routine will call the /// third party library BLAS routines whenever the parameters passed /// are consistent with the parameters expected by the corresponding /// TPL routine. If not, then this routine will route the execution -/// to the kokkos-kernels implementation, thus honorning all +/// to the kokkos-kernels implementation, thus honoring all /// parameters passed, as stated in the "important note 2" above. /// /// \tparam ExecutionSpace The type of execution space @@ -166,14 +166,14 @@ void syr( const ExecutionSpace & space /// even if a complex alpha is supplied in Hermitian cases. Moreover, /// this routine will always compute either the lower portion or the /// upper portion (per user's request) of the final matrix A. So, in -/// order to have meaningful results, the user must make sure to +/// order to obtain meaningful results, the user must make sure to /// follow the conditions specied in the "important note 1" above. /// /// Important note 3: if TPL is enabled, this routine will call the /// third party library BLAS routines whenever the parameters passed /// are consistent with the parameters expected by the corresponding /// TPL routine. If not, then this routine will route the execution -/// to the kokkos-kernels implementation, thus honorning all +/// to the kokkos-kernels implementation, thus honoring all /// parameters passed, as stated in the "important note 2" above. 
/// /// \tparam XViewType Input vector, as a 1-D Kokkos::View diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 0d536cb592..c06864356f 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -85,7 +85,12 @@ namespace Impl { } \ else { \ /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -145,7 +150,12 @@ namespace Impl { } \ else { \ /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -207,7 +217,12 @@ namespace Impl { } \ else { \ /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ else { \ @@ -229,7 +244,12 @@ namespace Impl { } \ else { \ /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -292,7 +312,12 @@ namespace Impl { } \ else { \ /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ else { \ @@ -314,7 
+339,12 @@ namespace Impl { } \ else { \ /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index f09a3d2fd9..8a96e5d4f1 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -85,7 +85,12 @@ namespace Impl { } \ else { \ /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -145,7 +150,12 @@ namespace Impl { } \ else { \ /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -207,7 +217,12 @@ namespace Impl { } \ else { \ /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ else { \ @@ -229,7 +244,12 @@ namespace Impl { } \ else { \ /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -292,7 +312,12 @@ namespace Impl 
{ } \ else { \ /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ else { \ @@ -314,7 +339,12 @@ namespace Impl { } \ else { \ /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - kk_syr(space, trans, uplo, alpha, X, A); \ + SYR< EXEC_SPACE \ + , XViewType \ + , AViewType \ + , false \ + , ETI_SPEC_AVAIL \ + >::syr( space, trans, uplo, alpha, X, A ); \ } \ } \ Kokkos::Profiling::popRegion(); \ From e3263a9b0a4781396e3d88488119a65bcb55fefa Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 00:05:27 -0600 Subject: [PATCH 035/231] Backup --- blas/impl/KokkosBlas2_syr_impl.hpp | 81 +++++++++++++++++------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 7bd66c55bf..5b340df3db 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -52,9 +52,12 @@ struct SingleLevelSYR { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } + else if (x_(i) == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const XComponentType x_fixed( x_(i) ); const IndexType N ( A_.extent(1) ); - const XComponentType x_fixed( x_(i) ); // Aqui: performance improvement if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { @@ -157,25 +160,30 @@ struct TwoLevelSYR { // Nothing to do } else { - const IndexType M ( A_.extent(0) ); - const IndexType j ( team.league_rank() ); - if (justTranspose_) { - const XComponentType x_fixed( x_(j) ); // Aqui: performance improvement - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); 
- } - }); + const IndexType j( team.league_rank() ); + if (x_(j) == Kokkos::ArithTraits::zero()) { + // Nothing to do } else { - const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); // Aqui: performance improvement - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); - } - }); + const IndexType M( A_.extent(0) ); + if (justTranspose_) { + const XComponentType x_fixed( x_(j) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); + } + }); + } + else { + const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); + } + }); + } } } } @@ -188,24 +196,29 @@ struct TwoLevelSYR { // Nothing to do } else { - const IndexType N ( A_.extent(1) ); - const IndexType i ( team.league_rank() ); - const XComponentType x_fixed( x_(i) ); // Aqui: performance improvement - if (justTranspose_) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); - } - }); + const IndexType i( team.league_rank() ); + if (x_(i) == Kokkos::ArithTraits::zero()) { + // Nothing to do } else { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); - } - }); + 
const IndexType N ( A_.extent(1) ); + const XComponentType x_fixed( x_(i) ); + if (justTranspose_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); + } + }); + } + else { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + if (( (justUp_ == true ) && (i <= j) ) || + ( (justUp_ == false) && (i >= j) )) { + A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); + } + }); + } } } team.team_barrier(); From 8284ac6fa7778fb7a22ab99d7bb5f8709dc51f01 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 12:14:44 -0600 Subject: [PATCH 036/231] Backup --- blas/unit_test/Test_Blas2_syr.hpp | 116 +++++++++++++++--------------- 1 file changed, 57 insertions(+), 59 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index a15bbafd16..a36b1661d2 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -515,73 +515,75 @@ SyrTester< ScalarX , _HostViewTypeA & h_A , _ViewTypeExpected & h_expected ) { - _AuxType auxI(0.); - _AuxType auxJ(0.); - _AuxType auxIpJ(0.); - _AuxType auxImJ(0.); - - alpha.real() = 1.; - alpha.imag() = -1.; + if (_useHermitianOption) { + alpha.real() = 1.; + alpha.imag() = 0.; + } + else { + alpha.real() = 1.; + alpha.imag() = -1.; + } for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); h_x[i].real() = sin(auxI); h_x[i].imag() = cos(auxI); } if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); for (int j = 0; j < _N; ++j) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); if (( 
(_useUpOption == true ) && (i <= j) ) || ( (_useUpOption == false) && (i >= j) )) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_A(i,j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); - h_A(i,j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i,j).real() = cos(auxImJ); + h_A(i,j).imag() = -sin(auxImJ); + } + else { + h_A(i,j).real() = cos(auxImJ); + h_A(i,j).imag() = sin(auxImJ); } } } } else { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_A(i,j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); - h_A(i,j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); - } + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_A(i,j).real() = sin(auxIpJ) + cos(auxIpJ); + h_A(i,j).imag() = sin(auxIpJ) - cos(auxIpJ); } } } if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); for (int j = 0; j < _N; ++j) { if (( (_useUpOption == true ) && (i <= j) ) || ( (_useUpOption == false) && (i >= j) )) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); // AquiEEP - h_expected(i,j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_expected(i,j).real() = 2. * cos(auxImJ); + h_expected(i,j).imag() = -2. 
* sin(auxImJ); + } + else { + h_expected(i,j).real() = h_A(i,j).real(); + h_expected(i,j).imag() = h_A(i,j).imag(); } } } } else { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); for (int j = 0; j < _N; ++j) { if (( (_useUpOption == true ) && (i <= j) ) || ( (_useUpOption == false) && (i >= j) )) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_expected(i,j).real() = 2. * cos(auxI) * cos(auxJ); - h_expected(i,j).imag() = -2. * sin(auxImJ); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_expected(i,j).real() = 2. * sin(auxIpJ); + h_expected(i,j).imag() = 2. * sin(auxIpJ); + } + else { + h_expected(i,j).real() = h_A(i,j).real(); + h_expected(i,j).imag() = h_A(i,j).imag(); } } } @@ -604,25 +606,18 @@ SyrTester< ScalarX , _HostViewTypeA & h_A , _ViewTypeExpected & h_expected ) { - _AuxType auxI(0.); - _AuxType auxJ(0.); - _AuxType auxIpJ(0.); - - alpha = 3; + alpha = 2; for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); h_x[i] = sin(auxI); } for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - h_A(i,j) = 3 * cos(auxI) * sin(auxJ); - } + _AuxType auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + h_A(i,j) = 2 * cos(auxI) * cos(auxJ); } } @@ -630,8 +625,11 @@ SyrTester< ScalarX for (int j = 0; j < _N; ++j) { if (( (_useUpOption == true ) && (i <= j) ) || ( (_useUpOption == false) && (i >= j) )) { - 
auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j) = 3 * sin(auxIpJ); // AquiEEP + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_expected(i,j) = 2 * cos(auxImJ); + } + else { + h_expected(i,j) = h_A(i,j); } } } @@ -1593,10 +1591,10 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0 , true, false, false); - //tester.test(1024, 0 , true, false, true); - //tester.test(1024, 0 , true, true, false); - //tester.test(1024, 0 , true, true, true); + tester.test(1024, 0 , true, false, false); + tester.test(1024, 0 , true, false, true); + tester.test(1024, 0 , true, true, false); + tester.test(1024, 0 , true, true, true); } tester.test(2, 0 , false, false, true); @@ -1628,10 +1626,10 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, false, false); - //tester.test(1024, 0, true, false, true); - //tester.test(1024, 0, true, true, false); - //tester.test(1024, 0, true, true, true); + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); } tester.test(2, 0, false, false, true); @@ -1663,10 +1661,10 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, false, false); - //tester.test(1024, 0, true, false, true); - //tester.test(1024, 0, true, true, false); - //tester.test(1024, 0, true, true, true); + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); } tester.test(2, 0, false, false, true); @@ -1695,8 +1693,8 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - //tester.test(1024, 0, true, 
false, true); - //tester.test(1024, 0, true, true, true); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, true); } tester.test(2, 0, false, false, true); From acfbdd6894aae95d3322c63f988c5a3cf52aa8f1 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 12:53:35 -0600 Subject: [PATCH 037/231] Formatting --- blas/impl/KokkosBlas2_syr_impl.hpp | 326 ++- blas/impl/KokkosBlas2_syr_spec.hpp | 140 +- blas/src/KokkosBlas2_syr.hpp | 99 +- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 226 +- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 474 ++--- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 639 +++--- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 595 +++--- blas/unit_test/Test_Blas2_syr.hpp | 1896 ++++++++--------- 8 files changed, 2062 insertions(+), 2333 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 5b340df3db..82966de639 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -33,89 +33,80 @@ struct SingleLevelSYR { using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - SingleLevelSYR( const bool justTranspose - , const bool justUp - , const AlphaCoeffType & alpha - , const XViewType & x - , const AViewType & A - ) - : justTranspose_(justTranspose) - , justUp_ (justUp) - , alpha_ (alpha) - , x_ (x) - , A_ (A) - { + SingleLevelSYR(const bool justTranspose, const bool justUp, + const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : justTranspose_(justTranspose), + justUp_(justUp), + alpha_(alpha), + x_(x), + A_(A) { // Nothing to do } - KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else if (x_(i) == Kokkos::ArithTraits::zero()) { + } else if (x_(i) == 
Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const XComponentType x_fixed( x_(i) ); - const IndexType N ( A_.extent(1) ); + } else { + const XComponentType x_fixed(x_(i)); + const IndexType N(A_.extent(1)); if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); } } - } - else { + } else { for (IndexType j = 0; j < N; ++j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(x_(j))); } } } } } -private: - bool justTranspose_; - bool justUp_; - AlphaCoeffType alpha_; + private: + bool justTranspose_; + bool justUp_; + AlphaCoeffType alpha_; typename XViewType::const_type x_; - AViewType A_; + AViewType A_; }; // Single-level parallel version of SYR. 
template -void singleLevelSyr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL singleLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); +void singleLevelSyr(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Entering IMPL singleLevelSyr(), AViewType = %s\n", + typeid(AViewType).name()); - static_assert(std::is_integral::value, "IndexType must be an integer"); + static_assert(std::is_integral::value, + "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; if (x.extent(0) == 0) { // no entries to update - } - else if (alpha == Kokkos::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update - } - else { - Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - SingleLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') - , (uplo[0] == 'U') || (uplo[0] == 'u') - , alpha - , x - , A - ); + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + SingleLevelSYR functor( + (trans[0] == 'T') || (trans[0] == 't'), + (uplo[0] == 'U') || (uplo[0] == 'u'), alpha, x, A); Kokkos::parallel_for("KokkosBlas::syr[SingleLevel]", rangePolicy, functor); } } @@ -125,157 +116,150 @@ struct TwoLevelSYR_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- -// Functor for a two-level parallel_reduce version of SYR, designed for performance on GPU. -// Kernel depends on the layout of A. -template +// Functor for a two-level parallel_reduce version of SYR, designed for +// performance on GPU. Kernel depends on the layout of A. 
+template struct TwoLevelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - TwoLevelSYR( const bool justTranspose - , const bool justUp - , const AlphaCoeffType & alpha - , const XViewType & x - , const AViewType & A - ) - : justTranspose_(justTranspose) - , justUp_ (justUp) - , alpha_ (alpha) - , x_ (x) - , A_ (A) - { + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TwoLevelSYR(const bool justTranspose, const bool justUp, + const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : justTranspose_(justTranspose), + justUp_(justUp), + alpha_(alpha), + x_(x), + A_(A) { // Nothing to do } -public: + public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutLeftTag - , const member_type & team - ) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelSYR_LayoutLeftTag, + const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType j( team.league_rank() ); + } else { + const IndexType j(team.league_rank()); if (x_(j) == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType M( A_.extent(0) ); + } else { + const IndexType M(A_.extent(0)); if (justTranspose_) { - const XComponentType x_fixed( x_(j) ); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); - } - }); - } - else { - const XComponentType x_fixed( Kokkos::ArithTraits::conj( x_(j) ) ); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - if (( (justUp_ 
== true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_(i) * x_fixed ); - } - }); + const XComponentType x_fixed(x_(j)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); + } else { + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelSYR_LayoutRightTag - , const member_type & team - ) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelSYR_LayoutRightTag, + const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType i( team.league_rank() ); + } else { + const IndexType i(team.league_rank()); if (x_(i) == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType N ( A_.extent(1) ); - const XComponentType x_fixed( x_(i) ); + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); if (justTranspose_) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * x_(j) ); - } - }); - } - else { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - if (( (justUp_ == true ) && (i <= j) ) || - ( (justUp_ == false) && (i >= j) )) { - A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( x_(j) ) ); - } - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const 
IndexType& j) { + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((justUp_ == true) && (i <= j)) || + ((justUp_ == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); } } } team.team_barrier(); } -private: - bool justTranspose_; - bool justUp_; - AlphaCoeffType alpha_; + private: + bool justTranspose_; + bool justUp_; + AlphaCoeffType alpha_; typename XViewType::const_type x_; - AViewType A_; + AViewType A_; }; // Two-level parallel version of SYR. template -void twoLevelSyr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL twoLevelSyr(), AViewType = %s\n", typeid(AViewType).name() ); +void twoLevelSyr(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Entering IMPL twoLevelSyr(), AViewType = %s\n", + typeid(AViewType).name()); - static_assert(std::is_integral::value, "IndexType must be an integer"); + static_assert(std::is_integral::value, + "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; if (x.extent(0) == 0) { // no entries to update return; - } - else if (alpha == Kokkos::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update return; } - constexpr bool isLayoutLeft = std::is_same::value; - using layout_tag = typename std::conditional::type; + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; using TeamPolicyType = 
Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { // LayoutLeft: one team per column teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); - } - else { + } else { // LayoutRight: one team per row teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelSYR functor( (trans[0] == 'T') || (trans[0] == 't') - , (uplo[0] == 'U') || (uplo[0] == 'u') - , alpha - , x - , A - ); + TwoLevelSYR functor( + (trans[0] == 'T') || (trans[0] == 't'), + (uplo[0] == 'U') || (uplo[0] == 'u'), alpha, x, A); Kokkos::parallel_for("KokkosBlas::syr[twoLevel]", teamPolicy, functor); } @@ -285,37 +269,31 @@ void twoLevelSyr( const ExecutionSpace & space // depending on whether execution space is CPU or GPU. // The 'enable_if' makes sure unused kernels are not instantiated. -template < class ExecutionSpace - , class XViewType - , class AViewType - , class IndexType - , typename std::enable_if() >::type* = nullptr - > -void generalSyrImpl( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalSyrImpl(CPU), AViewType = %s\n", typeid(AViewType).name() ); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Entering IMPL generalSyrImpl(CPU), AViewType = %s\n", + typeid(AViewType).name()); singleLevelSyr(space, trans, uplo, alpha, x, A); } -template < class ExecutionSpace - , class XViewType - , class AViewType - , class IndexType - , typename std::enable_if()>::type* = nullptr - > -void generalSyrImpl( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & 
A - ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalSyrImpl(GPU), AViewType = %s\n", typeid(AViewType).name() ); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Entering IMPL generalSyrImpl(GPU), AViewType = %s\n", + typeid(AViewType).name()); twoLevelSyr(space, trans, uplo, alpha, x, A); } diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 786122868b..37e173f6e0 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -35,26 +35,21 @@ struct syr_eti_spec_avail { } // namespace KokkosBlas // -// Macro for declaration of full specialization availability KokkosBlas::Impl::SYR. -// This is NOT for users!!! -// All the declarations of full specializations go in this header file. -// We may spread out definitions (see _INST macro below) across one or more .cpp files. +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::SYR. This is NOT for users!!! All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. 
// -#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_eti_spec_avail< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -69,54 +64,42 @@ namespace Impl { // // Implementation of KokkosBlas::syr. -template < class ExecutionSpace - , class XViewType - , class AViewType - , bool tpl_spec_avail = syr_tpl_spec_avail::value - , bool eti_spec_avail = syr_eti_spec_avail::value - > +template ::value, + bool eti_spec_avail = + syr_eti_spec_avail::value> struct SYR { - static void syr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) + static void syr(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr[ETI]" : "KokkosBlas::syr[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::syr[ETI]" + : "KokkosBlas::syr[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larsyr type if needed. 
- if (( numRows < static_cast(INT_MAX) ) && - ( numCols < static_cast(INT_MAX) )) { - generalSyrImpl( space - , trans - , uplo - , alpha - , x - , A - ); - } - else { - generalSyrImpl( space - , trans - , uplo - , alpha - , x - , A - ); + if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + generalSyrImpl( + space, trans, uplo, alpha, x, A); + } else { + generalSyrImpl( + space, trans, uplo, alpha, x, A); } Kokkos::Profiling::popRegion(); } #else - ; -#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY }; } // namespace Impl @@ -126,41 +109,30 @@ struct SYR { // Macro for declaration of full specialization of KokkosBlas::Impl::SYR. // This is NOT for users!!! // All the declarations of full specializations go in this header file. -// We may spread out definitions (see _DEF macro below) across one or more .cpp files. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. 
// -#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , false \ - , true \ - >; +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , false \ - , true \ - >; + template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #include #include -#endif // KOKKOSBLAS2_SYR_SPEC_HPP_ +#endif // KOKKOSBLAS2_SYR_SPEC_HPP_ diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 173b1843bb..fb36b3acf0 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -61,14 +61,12 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr( const ExecutionSpace & space - , const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::syr(), AViewType = %s\n", typeid(AViewType).name() ); +void syr(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& 
A) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Entering SRC KokkosBlas::syr(), AViewType = %s\n", + typeid(AViewType).name()); static_assert( Kokkos::SpaceAccessibility::accessible, "XViewType memory space must be accessible from ExecutionSpace"); - static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); - static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." ); + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); - static_assert( static_cast(AViewType::rank) == 2, "AViewType must have rank 2." ); - static_assert( static_cast(XViewType::rank) == 1, "XViewType must have rank 1." ); + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, + "XViewType must have rank 1."); // Check compatibility of dimensions at run time. - if (( A.extent(0) != x.extent(0) ) || - ( A.extent(1) != x.extent(0) )) { + if ((A.extent(0) != x.extent(0)) || (A.extent(1) != x.extent(0))) { std::ostringstream os; os << "KokkosBlas::syr: Dimensions of A, x: " - << "A is " << A.extent(0) << " by " << A.extent(1) - << ", x has size " << x.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -110,13 +111,10 @@ void syr( const ExecutionSpace & space KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((uplo[0] == 'U') || - (uplo[0] == 'u') || - (uplo[0] == 'L') || + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l')) { // Ok - } - else { + } else { std::ostringstream oss; oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] << "'. 
It must be equalt to 'U' or 'u' or 'L' or 'l'"; @@ -129,27 +127,21 @@ void syr( const ExecutionSpace & space using ALayout = typename AViewType::array_layout; - // Minimize the number of Impl::SYR instantiations, by standardizing + // Minimize the number of Impl::SYR instantiations, by standardizing // on particular View specializations for its template parameters. - typedef Kokkos::View< typename XViewType::const_value_type* - , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout - , typename XViewType::device_type - , Kokkos::MemoryTraits - > XVT; - - typedef Kokkos::View< typename AViewType::non_const_value_type** - , ALayout - , typename AViewType::device_type - , Kokkos::MemoryTraits - > AVT; - - Impl::SYR::syr( space - , trans - , uplo - , alpha - , x - , A - ); + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View > + AVT; + + Impl::SYR::syr(space, trans, uplo, alpha, x, A); } /// \brief Rank-1 update (just lower portion or just upper portion) of a @@ -188,22 +180,15 @@ void syr( const ExecutionSpace & space /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr( const char trans[] - , const char uplo[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const AViewType & A - ) { - const typename AViewType::execution_space space = typename AViewType::execution_space(); - syr( space - , trans - , uplo - , alpha - , x - , A - ); +void syr(const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + syr( + space, trans, uplo, alpha, x, A); } -} // namespace KokkosBlas +} // namespace KokkosBlas -#endif // KOKKOSBLAS2_SYR_HPP_ +#endif // KOKKOSBLAS2_SYR_HPP_ diff --git 
a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index 2be20a44af..69b90e85bf 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -28,45 +28,59 @@ struct syr_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + 
Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) 
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -74,77 +88,105 @@ KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) 
-KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + 
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, 
Kokkos::HIP, Kokkos::HIPSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) #endif } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ +#endif // KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 92422c34ae..6b64fce2bc 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -31,59 +31,37 @@ namespace Impl { #define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ 
- , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ - HostBlas::syr( uplo[0] \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ + LDA); \ + } else { \ /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -91,250 +69,214 @@ namespace Impl { #define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** 
\ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ - HostBlas::syr( uplo[0] \ - , N \ - , alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ + LDA); \ + } else { \ /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT 
\ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr() => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - else { \ - if (A_is_ll) { \ - HostBlas>::zher( uplo[0] \ - , N \ - , alpha.real() \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const 
char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::zher( \ + uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr() => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - 
, XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - HostBlas>::cher( uplo[0] \ - , N \ - , alpha.real() \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr() => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::cher( \ + uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef 
KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + 
true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, 
Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index c06864356f..dad3c93dbc 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,374 +22,309 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, 
uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? CUBLAS_FILL_MODE_LOWER \ + : CUBLAS_FILL_MODE_UPPER; -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDsyr( s.handle \ - , fillMode \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const float* \ - , LAYOUT \ - , 
Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSsyr( s.handle \ - , fillMode \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char 
trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton & s = 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZsyr( s.handle \ - , fillMode \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZher( s.handle \ - , fillMode \ - , N \ - , &alpha_val \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + 
const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher( \ + s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , 
Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCsyr( s.handle \ - , fillMode \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCher( s.handle \ - , fillMode \ - , N \ - , &alpha_val \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - } \ - else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) 
-KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + 
true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, 
Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 8a96e5d4f1..c90ff729e5 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -22,354 +22,277 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower : rocblas_fill_upper; +#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
rocblas_fill_lower \ + : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dsyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - } \ - else { \ - /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double 
SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ + s.handle, uplo, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ - 
KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_ssyr( s.handle \ - , uplo \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - } \ - else { \ - /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ + s.handle, uplo, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, 
trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zsyr( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - else { \ - /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - 
KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zher( s.handle \ - , uplo \ - , N \ - , &alpha_val \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - else { \ - /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ + s.handle, uplo, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); 
\ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ + s.handle, uplo, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< EXEC_SPACE \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void syr( const typename AViewType::execution_space & space \ - , const char trans[] \ - , const char uplo[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') 
|| (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_csyr( s.handle \ - , uplo \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - else { \ - /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cher( s.handle \ - , uplo \ - , N \ - , &alpha_val \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - else { \ - /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' implementation */ \ - SYR< EXEC_SPACE \ - , XViewType \ - , AViewType \ - , false \ - , ETI_SPEC_AVAIL \ - >::syr( space, trans, uplo, alpha, X, A ); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ 
+ Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ + s.handle, uplo, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ + s.handle, uplo, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) 
-KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true ) 
-KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index a36b1661d2..00a58bdd86 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -24,225 +24,189 @@ namespace Test { constexpr double piVal = 3.14159265358979323846; -template -class SyrTester -{ -public: +template +class SyrTester { + public: SyrTester(); ~SyrTester(); - void test( const int N - , const int nonConstConstCombinations - , const bool useAnalyticalResults = false - , const bool useHermitianOption = false - , const bool useUpOption = false - ); + void test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false, + const bool useUpOption = false); -private: - typedef Kokkos::View _ViewTypeX; + private: + typedef Kokkos::View _ViewTypeX; typedef Kokkos::View _ViewTypeA; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View _ViewTypeExpected; + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View + _ViewTypeExpected; typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; + typedef typename _KAT_A::mag_type _AuxType; - void 
populateVariables( ScalarA & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - , _ViewTypeX & x - , _ViewTypeA & A - , bool & expectedResultIsKnown - ); + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeA& A, + bool& expectedResultIsKnown); template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ); - + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); + template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ); - + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ); - + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + 
populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - compareVanillaAgainstExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - compareVanillaAgainstExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - compareKkSyrAgainstReference( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_reference - ); + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - compareKkSyrAgainstReference( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_reference - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyrAndCompareAgainstExpected( const ScalarA & alpha - , TX & x - , _ViewTypeA & A - 
, const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - , const std::string & situation - ); + void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); template - void callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha - , TX & x - , _HostViewTypeA & h_A_orig - , const _ViewTypeExpected & h_A_syr - , const std::string & situation - ); - - const bool _A_is_complex; - const bool _A_is_lr; - const bool _A_is_ll; - const bool _testIsGpu; - const bool _vanillaUsesDifferentOrderOfOps; + void callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, + _HostViewTypeA& h_A_orig, + const _ViewTypeExpected& h_A_syr, + const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; const _AuxType _epsAbs; const _AuxType _epsRel; - int _M; - int _N; - bool _useAnalyticalResults; - bool _useHermitianOption; - bool _useUpOption; - bool _kkSyrShouldThrowException; - bool _kkGerShouldThrowException; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyrShouldThrowException; + bool _kkGerShouldThrowException; }; -template -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::SyrTester() - : _A_is_complex ( std::is_same>::value || std::is_same>::value ) - , _A_is_lr ( std::is_same< tLayoutA, Kokkos::LayoutRight >::value ) - , _A_is_ll ( std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) - , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) +template +SyrTester::SyrTester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename 
Device::execution_space>()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - , _vanillaUsesDifferentOrderOfOps( _A_is_lr ) + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr) #else - , _vanillaUsesDifferentOrderOfOps( false ) + , + _vanillaUsesDifferentOrderOfOps(false) #endif - , _epsAbs (std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9) - , _epsRel (std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6) - , _M (-1) - , _N (-1) - , _useAnalyticalResults (false) - , _useHermitianOption (false) - , _useUpOption (false) - , _kkSyrShouldThrowException (false) - , _kkGerShouldThrowException (false) -{ + , + _epsAbs(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _epsRel(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _useUpOption(false), + _kkSyrShouldThrowException(false), + _kkGerShouldThrowException(false) { } -template -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::~SyrTester() -{ +template +SyrTester::~SyrTester() { // Nothing to do } -template -void SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::test( const int N - , const int nonConstConstCombinations - , const bool useAnalyticalResults - , const bool useHermitianOption - , const bool useUpOption - ) -{ - std::cout << "Entering SyrTester::test()... 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; - - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps - << ", _epsAbs = " << _epsAbs - << ", _epsRel = " << _epsRel - << ", nonConstConstCombinations = " << nonConstConstCombinations - << ", useAnalyticalResults = " << useAnalyticalResults - << ", useHermitianOption = " << useHermitianOption - << ", useUpOption = " << useUpOption +template +void SyrTester::test( + const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, const bool useHermitianOption, + const bool useUpOption) { + std::cout << "Entering SyrTester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " << std::endl; - + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _epsAbs = " << _epsAbs + << ", _epsRel = " << _epsRel + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; + // ******************************************************************** // Step 1 of 7: declare main types and variables // ******************************************************************** - _M = N; - _N = N; + _M = N; + _N = N; _useAnalyticalResults = useAnalyticalResults; _useHermitianOption = useHermitianOption; _useUpOption = useUpOption; @@ -256,15 +220,13 @@ void SyrTester< ScalarX } #endif - bool test_x (false); + bool test_x(false); bool 
test_cx(false); if (nonConstConstCombinations == 0) { test_x = true; - } - else if (nonConstConstCombinations == 1) { + } else if (nonConstConstCombinations == 1) { test_cx = true; - } - else { + } else { test_x = true; test_cx = true; } @@ -272,7 +234,8 @@ void SyrTester< ScalarX view_stride_adapter<_ViewTypeX, false> x("X", _M); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -280,26 +243,19 @@ void SyrTester< ScalarX // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables( alpha - , x.h_view - , A.h_view - , h_expected.d_view - , x.d_view - , A.d_view - , expectedResultIsKnown - ); + this->populateVariables(alpha, x.h_view, A.h_view, h_expected.d_view, + x.d_view, A.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name() ); - this->populateVanillaValues( alpha - , x.h_view - , A.h_view - , h_vanilla.d_view - ); - + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * x^{t,h}", _M, _N); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); + this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); + // 
******************************************************************** // Step 4 of 7: use h_vanilla and h_expected as appropriate // ******************************************************************** @@ -307,18 +263,15 @@ void SyrTester< ScalarX // ****************************************************************** // Compare h_vanilla against h_expected // ****************************************************************** - this->compareVanillaAgainstExpected( alpha - , h_vanilla.d_view - , h_expected.d_view - ); - } - else { + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); + } else { // ****************************************************************** // Copy h_vanilla to h_expected // ****************************************************************** Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); } - + // ******************************************************************** // Step 5 of 7: test with 'non const x' // ******************************************************************** @@ -326,23 +279,14 @@ void SyrTester< ScalarX Kokkos::deep_copy(org_A.d_base, A.d_base); if (test_x) { - this->callKkSyrAndCompareAgainstExpected( alpha - , x.d_view - , A.d_view - , A.h_view - , h_expected.d_view - , "non const x" - ); - - if ((_useAnalyticalResults == false) && // Just to save run time + this->callKkSyrAndCompareAgainstExpected( + alpha, x.d_view, A.d_view, A.h_view, h_expected.d_view, "non const x"); + + if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { Kokkos::deep_copy(org_A.h_view, org_A.d_base); - this->callKkGerAndCompareKkSyrAgainstIt( alpha - , x.d_view - , org_A.h_view - , A.h_view - , "non const x" - ); + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A.h_view, + A.h_view, "non const x"); } } @@ -351,100 +295,87 @@ void SyrTester< ScalarX // ******************************************************************** if (test_cx) { 
Kokkos::deep_copy(A.d_base, org_A.d_base); - - this->callKkSyrAndCompareAgainstExpected( alpha - , x.d_view_const - , A.d_view - , A.h_view - , h_expected.d_view - , "const x" - ); + + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A.d_view, + A.h_view, h_expected.d_view, + "const x"); } // ******************************************************************** // Step 7 of 7: tests with invalid values on the first input parameter // ******************************************************************** - EXPECT_ANY_THROW( KokkosBlas::syr(".", "U", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW( KokkosBlas::syr( "", "U", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for mode ''"; - EXPECT_ANY_THROW( KokkosBlas::syr("T", ".", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for uplo '.'"; - EXPECT_ANY_THROW( KokkosBlas::syr("T", "", alpha, x.d_view, A.d_view) ) << "Failed test: kk syr should have thrown an exception for uplo ''"; - - std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; + EXPECT_ANY_THROW(KokkosBlas::syr(".", "U", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW(KokkosBlas::syr("", "U", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for mode ''"; + EXPECT_ANY_THROW(KokkosBlas::syr("T", ".", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW(KokkosBlas::syr("T", "", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for uplo ''"; + + std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- " + "- - - - - - - " + << std::endl; } -template -void SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::populateVariables( ScalarA & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - , _ViewTypeX & x - , _ViewTypeA & A - , bool & expectedResultIsKnown - ) -{ +template +void SyrTester::populateVariables( + ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, _ViewTypeA& A, + bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues( alpha - , h_x - , h_A - , h_expected - ); + this->populateAnalyticalValues(alpha, h_x, h_A, h_expected); Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(A, h_A); expectedResultIsKnown = true; - } - else if (_N == 1) { + } else if (_N == 1) { alpha = 3; h_x[0] = 2; - h_A(0,0) = 7; + h_A(0, 0) = 7; Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(A, h_A); - h_expected(0,0) = 19; + h_expected(0, 0) = 19; expectedResultIsKnown = true; - } - else if (_N == 2) { + } else if (_N == 2) { alpha = 3; h_x[0] = -2; h_x[1] = 9; - h_A(0,0) = 17; - h_A(0,1) = -43; - h_A(1,0) = -43; - h_A(1,1) = 101; + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = -43; + h_A(1, 1) = 101; Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(A, h_A); if (_useUpOption) { - h_expected(0,0) = 29; - h_expected(0,1) = -97; - h_expected(1,0) = -43; - h_expected(1,1) = 344; - } - else { - h_expected(0,0) = 29; - h_expected(0,1) = -43; - h_expected(1,0) = -97; - h_expected(1,1) = 344; + h_expected(0, 0) = 29; + h_expected(0, 1) = -97; + h_expected(1, 0) = -43; + h_expected(1, 1) = 344; + } else { + h_expected(0, 0) = 29; + h_expected(0, 1) = -43; + h_expected(1, 0) = -97; + h_expected(1, 1) = 344; } expectedResultIsKnown = true; - } - else { + } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); { ScalarX randStart, randEnd; @@ -466,22 
+397,21 @@ void SyrTester< ScalarX // Make h_A Hermitian // **************************************************************** for (int i(0); i < _N; ++i) { - for (int j(i+1); j < _N; ++j) { - h_A(i,j) = _KAT_A::conj( h_A(j,i) ); + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = _KAT_A::conj(h_A(j, i)); } } for (int i(0); i < _N; ++i) { - h_A(i,i) = 0.5 * ( h_A(i,i) + _KAT_A::conj( h_A(i,i) ) ); + h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); } - } - else { + } else { // **************************************************************** // Make h_A symmetric // **************************************************************** for (int i(0); i < _N; ++i) { - for (int j(i+1); j < _N; ++j) { - h_A(i,j) = h_A(j,i); + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = h_A(j, i); } } } @@ -491,41 +421,34 @@ void SyrTester< ScalarX if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i,j) + std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) << std::endl; } } } - } // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { if (_useHermitianOption) { alpha.real() = 1.; alpha.imag() = 0.; - } - else { - alpha.real() = 1.; + } else { + alpha.real() = 1.; alpha.imag() = -1.; } for (int i = 0; i < _M; ++i) { - _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); h_x[i].real() = sin(auxI); h_x[i].imag() = 
cos(auxI); } @@ -533,25 +456,25 @@ SyrTester< ScalarX if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_A(i,j).real() = cos(auxImJ); - h_A(i,j).imag() = -sin(auxImJ); - } - else { - h_A(i,j).real() = cos(auxImJ); - h_A(i,j).imag() = sin(auxImJ); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_A(i, j).real() = cos(auxImJ); + h_A(i, j).imag() = -sin(auxImJ); + } else { + h_A(i, j).real() = cos(auxImJ); + h_A(i, j).imag() = sin(auxImJ); } } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_A(i,j).real() = sin(auxIpJ) + cos(auxIpJ); - h_A(i,j).imag() = sin(auxIpJ) - cos(auxIpJ); + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = sin(auxIpJ) + cos(auxIpJ); + h_A(i, j).imag() = sin(auxIpJ) - cos(auxIpJ); } } } @@ -559,31 +482,30 @@ SyrTester< ScalarX if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_expected(i,j).real() = 2. * cos(auxImJ); - h_expected(i,j).imag() = -2. * sin(auxImJ); - } - else { - h_expected(i,j).real() = h_A(i,j).real(); - h_expected(i,j).imag() = h_A(i,j).imag(); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 2. 
* cos(auxImJ); + h_expected(i, j).imag() = -2. * sin(auxImJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); } } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j).real() = 2. * sin(auxIpJ); - h_expected(i,j).imag() = 2. * sin(auxIpJ); - } - else { - h_expected(i,j).real() = h_A(i,j).real(); - h_expected(i,j).imag() = h_A(i,j).imag(); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = 2. * sin(auxIpJ); + h_expected(i, j).imag() = 2. * sin(auxIpJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); } } } @@ -591,123 +513,106 @@ SyrTester< ScalarX } // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { alpha = 2; for (int i = 0; i < _M; ++i) { - _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); - h_x[i] = sin(auxI); + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); } for (int i = 0; i < _M; ++i) { - _AuxType auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + _AuxType auxI = 
this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - _AuxType auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - h_A(i,j) = 2 * cos(auxI) * cos(auxJ); + _AuxType auxJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 2 * cos(auxI) * cos(auxJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_expected(i,j) = 2 * cos(auxImJ); - } - else { - h_expected(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j) = 2 * cos(auxImJ); + } else { + h_expected(i, j) = h_A(i, j); } } } } // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * _KAT_A::conj( h_x(j) ) * h_x(i); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_x(j)) * h_x(i); + } 
else { + h_vanilla(i, j) = h_A(i, j); } } } for (int i = 0; i < _N; ++i) { - h_vanilla(i,i).imag() = 0.; + h_vanilla(i, i).imag() = 0.; } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(j) * h_x(i); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); } } } } - } - else { + } else { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * _KAT_A::conj( h_x(j) ); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); } } } for (int i = 0; i < _N; ++i) { - h_vanilla(i,i).imag() = 0.; + h_vanilla(i, i).imag() = 0.; } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_x(j); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); } } } @@ -716,58 +621,45 @@ SyrTester< ScalarX } // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::populateVanillaValues( 
const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(j) * h_x(i); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); } } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_x(j); - } - else { - h_vanilla(i,j) = h_A(i,j); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); } } } } } -template +template template -T SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::shrinkAngleToZeroTwoPiRange(const T input) -{ +T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { T output(input); #if 0 T twoPi( 2. 
* Kokkos::numbers::pi ); @@ -782,790 +674,803 @@ T SyrTester< ScalarX } // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::compareVanillaAgainstExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) << std::endl; } } } - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); if (_useAnalyticalResults) { - int numErrorsRealAbs (0); - int numErrorsRealRel (0); - int numErrorsImagAbs (0); - int numErrorsImagRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRealRel (0.); - int iForMaxErrorRealRel(0); - int jForMaxErrorRealRel(0); - _AuxType maxErrorImagRel (0.); - int iForMaxErrorImagRel(0); - int jForMaxErrorImagRel(0); + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = 
_KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; - if (h_expected(i,j).real() == 0.) { + if (h_expected(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_epsAbs); - if ( diff > diffThreshold ) { + if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); if (maxErrorRealRel < aux) { - maxErrorRealRel = aux; + maxErrorRealRel = aux; iForMaxErrorRealRel = i; jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); - if ( diff > diffThreshold ) { + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); + if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - diff = _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; - if (h_expected(i,j).imag() == 0.) { + if (h_expected(i, j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); - if ( diff > diffThreshold ) { + if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); if (maxErrorImagRel < aux) { - maxErrorImagRel = aux; + maxErrorImagRel = aux; iForMaxErrorImagRel = i; jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); - if ( diff > diffThreshold ) { + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); + if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i + } // for j + } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on real components" - << ", 
numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_vanilla(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_vanilla(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); } - } - else { + } else { int numErrorsReal(0); int numErrorsImag(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - if ( h_expected(i,j).real() != h_vanilla(i,j).real() ) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; } numErrorsReal++; } - if ( h_expected(i,j).imag() != h_vanilla(i,j).imag() ) { + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; } numErrorsImag++; } - } // for j - } // for i - EXPECT_EQ(numErrorsReal, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = 
" << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } - + // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::compareVanillaAgainstExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& 
h_expected) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i,j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) << std::endl; } } } - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); if (_useAnalyticalResults) { - int numErrorsAbs (0); - int numErrorsRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRel (0.); - int iForMaxErrorRel(0); - int jForMaxErrorRel(0); + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)); + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); errorHappened = false; - if (h_expected(i,j) == 0.) { + if (h_expected(i, j) == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); if (maxErrorRel < aux) { - maxErrorRel = aux; + maxErrorRel = aux; iForMaxErrorRel = i; jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_vanilla(i,j) = " << h_vanilla(i,j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i + } // for j + } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_vanilla(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } - } - else { + } else { int numErrors(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - if ( h_expected(i,j) != h_vanilla(i,j) ) { + if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_vanilla(i,j) = " << h_vanilla(i,j) - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; } numErrors++; } - } // for j - } // for i - EXPECT_EQ(numErrors, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << 
_A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } - + // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::compareKkSyrAgainstReference( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_reference - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester:: + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i,j) - << ", h_A(" << i << "," << j << ")=" << h_A(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) << std::endl; } } } - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); - - int numErrorsRealAbs (0); - int numErrorsRealRel (0); - int numErrorsImagAbs (0); - int numErrorsImagRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRealRel (0.); - int iForMaxErrorRealRel(0); - int jForMaxErrorRealRel(0); - _AuxType maxErrorImagRel (0.); - int iForMaxErrorImagRel(0); - int jForMaxErrorImagRel(0); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int 
iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()); + diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); errorHappened = false; - if (h_reference(i,j).real() == 0.) { + if (h_reference(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_reference(i,j).real()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).real()); if (maxErrorRealRel < aux) { - maxErrorRealRel = aux; + maxErrorRealRel = aux; iForMaxErrorRealRel = i; jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j).real()); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j).real()); if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_reference(i,j).real() = " << h_reference(i,j).real() - << ", h_A(i,j).real() = " << h_A(i,j).real() - << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - diff = _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()); + diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); errorHappened = false; - if (h_reference(i,j).imag() == 0.) { + if (h_reference(i, j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_reference(i,j).imag()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).imag()); if (maxErrorImagRel < aux) { - maxErrorImagRel = aux; + maxErrorImagRel = aux; iForMaxErrorImagRel = i; jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j).imag()); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j).imag()); if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_reference(i,j).imag() = " << h_reference(i,j).imag() - << ", h_A(i,j).imag() = " << h_A(i,j).imag() - << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i - - std::cout << "A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed - << std::endl; + } // for j + } // for i + + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(11, 2119) = (" << h_reference(11,2119).real() << ", " << h_reference(11,2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" - << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ", " << h_reference(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(710, 1065) = (" << h_reference(710,1065).real() << ", " << h_reference(710,1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" - << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ", " << h_reference(710, 1065).imag() << ")" + << ", 
h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } - + // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::compareKkSyrAgainstReference( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_reference - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester:: + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i,j) - << ", h_A(" << i << "," << j << ")=" << h_A(i,j) + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) << std::endl; } } } - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); - - int numErrorsAbs (0); - int numErrorsRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRel (0.); - int iForMaxErrorRel(0); - int jForMaxErrorRel(0); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_reference(i,j) - h_A(i,j)); + diff = _KAT_A::abs(h_reference(i, 
j) - h_A(i, j)); errorHappened = false; - if (h_reference(i,j) == 0.) { + if (h_reference(i, j) == 0.) { diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_reference(i,j)); + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j)); if (maxErrorRel < aux) { - maxErrorRel = aux; + maxErrorRel = aux; iForMaxErrorRel = i; jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i,j)); + diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_reference(i,j) = " << h_reference(i,j) - << ", h_A(i,j) = " << h_A(i,j) + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i - std::cout << "A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_reference(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed - << std::endl; + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_reference(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? 
h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } -template +template template -void SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::callKkSyrAndCompareAgainstExpected( const ScalarA & alpha - , TX & x - , _ViewTypeA & A - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - , const std::string & situation - ) -{ - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException ); +void SyrTester:: + callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, + _ViewTypeA& A, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha + << std::endl; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " + "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? 
"U" : "L"; - bool gotStdException (false); + bool gotStdException(false); bool gotUnknownException(false); try { KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); - } - catch( const std::exception& e ) { - std::cout << "In Test_Blas2_syr, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; + } catch (const std::exception& e) { + std::cout << "In Test_Blas2_syr, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; gotStdException = true; - } - catch( ... ) { - std::cout << "In Test_Blas2_syr, '" << situation << "': caught unknown exception" << std::endl; + } catch (...) { + std::cout << "In Test_Blas2_syr, '" << situation + << "': caught unknown exception" << std::endl; gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened"; + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; - EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) << "Failed test, '" << situation << "': kk syr() should" - << (_kkSyrShouldThrowException ? " " : " not ") - << "have thrown a std::exception"; + EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) + << "Failed test, '" << situation << "': kk syr() should" + << (_kkSyrShouldThrowException ? 
" " : " not ") + << "have thrown a std::exception"; - if (( gotStdException == false ) && - ( gotUnknownException == false )) { + if ((gotStdException == false) && (gotUnknownException == false)) { Kokkos::deep_copy(h_A, A); - this->compareKkSyrAgainstReference( alpha - , h_A - , h_expected - ); + this->compareKkSyrAgainstReference(alpha, h_A, h_expected); } } -template +template template -void SyrTester< ScalarX - , tLayoutX - , ScalarA - , tLayoutA - , Device - >::callKkGerAndCompareKkSyrAgainstIt( const ScalarA & alpha - , TX & x - , _HostViewTypeA & h_A_orig - , const _ViewTypeExpected & h_A_syr - , const std::string & situation - ) -{ +void SyrTester:: + callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, + _HostViewTypeA& h_A_orig, + const _ViewTypeExpected& h_A_syr, + const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, h_A_orig); // ******************************************************************** // Call ger() // ******************************************************************** - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException ); + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha + << std::endl; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); std::string mode = _useHermitianOption ? 
"H" : "T"; - bool gotStdException (false); + bool gotStdException(false); bool gotUnknownException(false); try { KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); - } - catch( const std::exception& e ) { - std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught exception, e.what() = " << e.what() << std::endl; + } catch (const std::exception& e) { + std::cout << "In Test_Blas2_syr, '" << situation + << "', ger() call: caught exception, e.what() = " << e.what() + << std::endl; gotStdException = true; - } - catch( ... ) { - std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught unknown exception" << std::endl; + } catch (...) { + std::cout << "In Test_Blas2_syr, '" << situation + << "', ger() call: caught unknown exception" << std::endl; gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened for ger() call"; + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call"; - EXPECT_EQ(gotStdException, false) << "Failed test, '" << situation << "': kk ger() should not have thrown a std::exception"; + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() should not have thrown a std::exception"; // ******************************************************************** // Prepare h_ger_reference to be compared against h_A_syr // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( + "h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); std::string uplo = _useUpOption ? 
"U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (( (_useUpOption == true ) && (i <= j) ) || - ( (_useUpOption == false) && (i >= j) )) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed - } - else { - h_ger_reference.d_view(i,j) = h_A_orig(i,j); + } else { + h_ger_reference.d_view(i, j) = h_A_orig(i, j); } } } if (_useHermitianOption && _A_is_complex) { for (int i(0); i < _N; ++i) { - h_ger_reference.d_view(i,i) = 0.5 * ( h_ger_reference.d_view(i,i) + _KAT_A::conj( h_ger_reference.d_view(i,i) ) ); + h_ger_reference.d_view(i, i) = + 0.5 * (h_ger_reference.d_view(i, i) + + _KAT_A::conj(h_ger_reference.d_view(i, i))); } } // ******************************************************************** // Compare // ******************************************************************** - this->compareKkSyrAgainstReference( alpha - , h_A_syr - , h_ger_reference.d_view - ); + this->compareKkSyrAgainstReference(alpha, h_A_syr, h_ger_reference.d_view); } -} // namespace Test +} // namespace Test template -int test_syr( const std::string & caseName ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s ...\n", caseName.c_str() ); +int test_syr(const std::string& caseName) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); bool xBool = std::is_same::value || std::is_same::value || @@ -1578,12 +1483,18 @@ int test_syr( const std::string & caseName ) { bool useAnalyticalResults = xBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( 
"+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTLEFT ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); if (true) { - Test::SyrTester tester; + Test::SyrTester + tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1591,34 +1502,43 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0); if (useAnalyticalResults) { - tester.test(1024, 0 , true, false, false); - tester.test(1024, 0 , true, false, true); - tester.test(1024, 0 , true, true, false); - tester.test(1024, 0 , true, true, true); + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); } - tester.test(2, 0 , false, false, true); - tester.test(50, 0 , false, false, true); - tester.test(2, 0 , false, true, false); - tester.test(50, 0 , false, true, false); - tester.test(2, 0 , false, true, true); - tester.test(50, 0 , false, true, true); + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); - tester.test(50, 4 ); + tester.test(50, 4); tester.test(2131, 0); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + 
"+-----------------------------------------------------------------------" + "---\n"); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTRIGHT ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", + caseName.c_str()); if (true) { - Test::SyrTester tester; + Test::SyrTester + tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1632,28 +1552,37 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); + tester.test(2, 0, false, false, true); tester.test(50, 0, false, false, true); - tester.test(2, 0, false, true, false); + tester.test(2, 0, false, true, false); tester.test(50, 0, false, true, false); - tester.test(2, 0, false, true, true); + tester.test(2, 0, false, true, true); tester.test(50, 0, false, true, true); tester.test(50, 4); tester.test(2131, 0); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( 
"+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", + caseName.c_str()); if (true) { - Test::SyrTester tester; + Test::SyrTester + tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1667,27 +1596,36 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); + tester.test(2, 0, false, false, true); tester.test(50, 0, false, false, true); - tester.test(2, 0, false, true, false); + tester.test(2, 0, false, true, false); tester.test(50, 0, false, true, false); - tester.test(2, 0, false, true, true); + tester.test(2, 0, false, true, true); tester.test(50, 0, false, true, true); tester.test(50, 4); tester.test(2131, 0); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for MIXED LAYOUTS ...\n", caseName.c_str() ); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + 
"+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", + caseName.c_str()); if (true) { - Test::SyrTester tester; + Test::SyrTester + tester; tester.test(1, 0); tester.test(2, 0); tester.test(1024, 0); @@ -1697,68 +1635,82 @@ int test_syr( const std::string & caseName ) { tester.test(1024, 0, true, true, true); } - tester.test(2, 0, false, false, true); + tester.test(2, 0, false, false, true); tester.test(50, 0, false, false, true); - tester.test(2, 0, false, true, true); + tester.test(2, 0, false, true, true); tester.test(50, 0, false, true, true); } if (true) { - Test::SyrTester tester; + Test::SyrTester + tester; tester.test(1024, 0); } - - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for MIXED LAYOUTS\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); - test_syr( "test case syr_float" ); + test_syr("test case syr_float"); Kokkos::Profiling::popRegion(); } #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); - test_syr, Kokkos::complex, TestExecSpace>( "test case syr_complex_float" ); + test_syr, Kokkos::complex, TestExecSpace>( + "test case syr_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); - test_syr( "test case syr_double" ); + test_syr("test case syr_double"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); - test_syr, Kokkos::complex, TestExecSpace>( "test case syr_complex_double" ); + test_syr, Kokkos::complex, TestExecSpace>( + "test case syr_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); - test_syr( "test case syr_int" ); + test_syr("test case syr_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1767,7 +1719,7 @@ 
TEST_F(TestCategory, syr_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double_int"); - test_syr( "test case syr_mixed_types" ); + test_syr("test case syr_mixed_types"); Kokkos::Profiling::popRegion(); } #endif From 84c88103721e7cd2ead39e2d84d7fe9dd7a488ff Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 13:38:08 -0600 Subject: [PATCH 038/231] Documentation --- docs/developer/apidocs/blas2.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 20dbc5ea9a..9d96567929 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -10,3 +10,8 @@ ger ---- .. doxygenfunction:: KokkosBlas::ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) .. doxygenfunction:: KokkosBlas::ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) + +syr +---- +.. doxygenfunction:: KokkosBlas::syr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) +.. 
doxygenfunction:: KokkosBlas::syr(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) From 92b5272ff5294e8d24b05d228a861d071f4631ab Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 13:42:39 -0600 Subject: [PATCH 039/231] Formatting --- blas/src/KokkosBlas1_swap.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index ea864cc3b4..cd976b8676 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,7 +41,7 @@ namespace KokkosBlas { /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { // AquiEEP +void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); @@ -93,7 +93,7 @@ void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { -#if 0 // AquiEEP +#if 0 // AquiEEP Impl::Swap::swap(space, X, Y); #endif From 6bec18cce6ee592cabbf4ea8c2136de2eb0e2391 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 13:47:06 -0600 Subject: [PATCH 040/231] Backup --- blas/tpls/KokkosBlas_Host_tpl.cpp | 130 +++++++----------------------- blas/tpls/KokkosBlas_Host_tpl.hpp | 38 +++------ 2 files changed, 40 insertions(+), 128 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index d8183d7ab9..b85f6109e8 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -252,22 +252,10 @@ void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, /// /// Syr /// 
-void F77_BLAS_MANGLE(ssyr, SSYR)( const char* - , int* - , const float* - , const float* - , int* - , float* - , int* - ); -void F77_BLAS_MANGLE(dsyr, DSYR)( const char* - , int* - , const double* - , const double* - , int* - , double* - , int* - ); +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, int*, const float*, const float*, + int*, float*, int*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, + const double*, int*, double*, int*); // Although there is a cgeru, there is no csyru // Although there is a zgeru, there is no zsyru // Although there is a cgerc, there is no csyrc, but there is cher (see below) @@ -277,22 +265,12 @@ void F77_BLAS_MANGLE(dsyr, DSYR)( const char* /// Her /// -void F77_BLAS_MANGLE(cher, CHER)( const char* - , int* - , const float* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(zher, ZHER)( const char* - , int* - , const double* - , const std::complex* - , int* - , std::complex* - , int* - ); +void F77_BLAS_MANGLE(cher, CHER)(const char*, int*, const float*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, + const std::complex*, int*, + std::complex*, int*); /// /// Trsv @@ -515,11 +493,11 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) -#define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) -#define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) +#define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) +#define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) -#define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) -#define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) +#define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) +#define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) @@ -628,22 +606,9 @@ void HostBlas::ger(int m, int n, const float alpha, const 
float* x, F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr( const char uplo - , int n - , const float alpha - , const float* x - , int incx - , float* a - , int lda - ) { - F77_FUNC_SSYR( &uplo - , &n - , &alpha - , x - , &incx - , a - , &lda - ); +void HostBlas::syr(const char uplo, int n, const float alpha, + const float* x, int incx, float* a, int lda) { + F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, @@ -765,22 +730,9 @@ void HostBlas::ger(int m, int n, const double alpha, const double* x, F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr( const char uplo - , int n - , const double alpha - , const double* x - , int incx - , double* a - , int lda - ) { - F77_FUNC_DSYR( &uplo - , &n - , &alpha - , x - , &incx - , a - , &lda - ); +void HostBlas::syr(const char uplo, int n, const double alpha, + const double* x, int incx, double* a, int lda) { + F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, @@ -930,22 +882,11 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::cher( const char uplo - , int n - , const float alpha - , const std::complex* x - , int incx - , std::complex* a - , int lda - ) { - F77_FUNC_CHER( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); +void HostBlas >::cher( + const char uplo, int n, const float alpha, const std::complex* x, + int incx, std::complex* a, int lda) { + F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (std::complex*)a, &lda); } template <> void HostBlas >::trsv(const char uplo, const char transa, @@ -1119,22 +1060,11 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::zher( const char uplo - , int n - , const double alpha - , const std::complex* x - , int incx - , 
std::complex* a - , int lda - ) { - F77_FUNC_ZHER( &uplo - , &n - , &alpha - , (const std::complex*)x - , &incx - , (std::complex*)a - , &lda - ); +void HostBlas >::zher( + const char uplo, int n, const double alpha, const std::complex* x, + int incx, std::complex* a, int lda) { + F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (std::complex*)a, &lda); } template <> void HostBlas >::trsv(const char uplo, const char transa, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 8b6391c92d..6f6c34dc25 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -73,34 +73,16 @@ struct HostBlas { static void gerc(int m, int n, const T alpha, const T *x, int incx, const T *y, int incy, T *a, int lda); - static void syr( const char uplo - , int n - , const T alpha - , const T* x - , int incx - , T* a - , int lda - ); - - template< typename tAlpha > - static void cher( const char uplo - , int n - , const tAlpha alpha - , const T* x - , int incx - , T* a - , int lda - ); - - template< typename tAlpha > - static void zher( const char uplo - , int n - , const tAlpha alpha - , const T* x - , int incx - , T* a - , int lda - ); + static void syr(const char uplo, int n, const T alpha, const T *x, int incx, + T *a, int lda); + + template + static void cher(const char uplo, int n, const tAlpha alpha, const T *x, + int incx, T *a, int lda); + + template + static void zher(const char uplo, int n, const tAlpha alpha, const T *x, + int incx, T *a, int lda); static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, From 00b777e046149ca6a92398ed93f353796bdc551c Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 13:48:47 -0600 Subject: [PATCH 041/231] Formatting --- blas/src/KokkosBlas1_swap.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index 
cd976b8676..dbd1579edd 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,7 +41,8 @@ namespace KokkosBlas { /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& /*space*/, XVector const& x, YVector const& y) { +void swap(execution_space const& /*space*/, XVector const& x, + YVector const& y) { // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); From aeb6f7e9b79cef77ddfaffcf9f146f3370a29730 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 19 May 2023 14:45:22 -0600 Subject: [PATCH 042/231] Returning to the swap version that is in develop --- blas/src/KokkosBlas1_swap.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index dbd1579edd..f91d090cd5 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -41,8 +41,7 @@ namespace KokkosBlas { /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking template -void swap(execution_space const& /*space*/, XVector const& x, - YVector const& y) { +void swap(execution_space const& space, XVector const& x, YVector const& y) { // Assert properties of XVector static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); @@ -94,10 +93,8 @@ void swap(execution_space const& /*space*/, XVector const& x, Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { -#if 0 // AquiEEP Impl::Swap::swap(space, X, Y); -#endif } Kokkos::Profiling::popRegion(); } From c2a10b91a56b59af8f8548988eff73890bb390cb Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 22 May 2023 20:06:25 -0600 Subject: [PATCH 043/231] Sync with develop --- blas/impl/KokkosBlas2_syr_spec.hpp | 1 - 1 file 
changed, 1 deletion(-) diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 37e173f6e0..845ba26f77 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -133,6 +133,5 @@ struct SYR { false, true>; #include -#include #endif // KOKKOSBLAS2_SYR_SPEC_HPP_ From 29ed900a2ca7195768e9454dba36ac076a4806ba Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 00:05:31 -0600 Subject: [PATCH 044/231] Correcting compilation errors at blake --- blas/unit_test/Test_Blas2_syr.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 00a58bdd86..5ea8a02144 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -126,7 +126,7 @@ class SyrTester { template void callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, - _HostViewTypeA& h_A_orig, + view_stride_adapter<_ViewTypeA, false>& org_A, const _ViewTypeExpected& h_A_syr, const std::string& situation); @@ -277,6 +277,7 @@ void SyrTester::test( // ******************************************************************** view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); Kokkos::deep_copy(org_A.d_base, A.d_base); + Kokkos::deep_copy(org_A.h_view, A.h_view); if (test_x) { this->callKkSyrAndCompareAgainstExpected( @@ -284,8 +285,7 @@ void SyrTester::test( if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { - Kokkos::deep_copy(org_A.h_view, org_A.d_base); - this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A.h_view, + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, "non const x"); } } @@ -1392,11 +1392,11 @@ template void SyrTester:: callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, - _HostViewTypeA& h_A_orig, + view_stride_adapter<_ViewTypeA, false>& org_A, const _ViewTypeExpected& h_A_syr, 
const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); - Kokkos::deep_copy(A_ger.d_base, h_A_orig); + Kokkos::deep_copy(A_ger.d_base, org_A.d_base); // ******************************************************************** // Call ger() @@ -1445,7 +1445,7 @@ void SyrTester:: ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed } else { - h_ger_reference.d_view(i, j) = h_A_orig(i, j); + h_ger_reference.d_view(i, j) = org_A.h_view(i, j); } } } From d99df7866dbe9b1b0f325d50384d65221ef08d36 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 00:12:12 -0600 Subject: [PATCH 045/231] Formatting --- blas/unit_test/Test_Blas2_syr.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 5ea8a02144..1155a1898f 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -125,10 +125,10 @@ class SyrTester { const std::string& situation); template - void callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _ViewTypeExpected& h_A_syr, - const std::string& situation); + void callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -285,8 +285,8 @@ void SyrTester::test( if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { - this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, - A.h_view, "non const x"); + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, + "non const x"); } } @@ -1391,10 +1391,10 @@ template template void SyrTester:: - callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, 
false>& org_A, - const _ViewTypeExpected& h_A_syr, - const std::string& situation) { + callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr, const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); From 66244bf412940c931d1c661e26a3a881fc8d2aed Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 01:03:25 -0600 Subject: [PATCH 046/231] Typo --- blas/src/KokkosBlas2_syr.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index fb36b3acf0..87a5aebca0 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -36,7 +36,7 @@ namespace KokkosBlas { /// this routine will always compute either the lower portion or the /// upper portion (per user's request) of the final matrix A. So, in /// order to obtain meaningful results, the user must make sure to -/// follow the conditions specied in the "important note 1" above. +/// follow the conditions specified in the "important note 1" above. /// /// Important note 3: if TPL is enabled, this routine will call the /// third party library BLAS routines whenever the parameters passed @@ -159,7 +159,7 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], /// this routine will always compute either the lower portion or the /// upper portion (per user's request) of the final matrix A. So, in /// order to obtain meaningful results, the user must make sure to -/// follow the conditions specied in the "important note 1" above. +/// follow the conditions specified in the "important note 1" above. 
/// /// Important note 3: if TPL is enabled, this routine will call the /// third party library BLAS routines whenever the parameters passed From 9e8757fb19d769b12cb95bc3d435ad0771025667 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 12:39:15 -0600 Subject: [PATCH 047/231] Removing a file that is not needed anymore --- .../KokkosBlas2_syr_eti_spec_decl.hpp.in | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in deleted file mode 100644 index 4b66faf5b2..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in +++ /dev/null @@ -1,25 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS2_SYR_ETI_DECL_BLOCK@ -} //IMPL -} //Kokkos -#endif From 7c97a0d0c721e74e44bd04f5d93d200f8ee71d4d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 13:11:13 -0600 Subject: [PATCH 048/231] Removing PRINTF from any syr() code that is not in the unit test code --- blas/impl/KokkosBlas2_syr_impl.hpp | 13 ------------- blas/src/KokkosBlas2_syr.hpp | 4 ---- 2 files changed, 17 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 82966de639..5388829030 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -88,10 +88,6 @@ void singleLevelSyr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Entering IMPL singleLevelSyr(), AViewType = %s\n", - typeid(AViewType).name()); - static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -226,9 +222,6 @@ void twoLevelSyr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Entering IMPL twoLevelSyr(), AViewType = %s\n", - typeid(AViewType).name()); - static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -277,9 +270,6 @@ void generalSyrImpl(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Entering IMPL generalSyrImpl(CPU), AViewType = %s\n", - typeid(AViewType).name()); singleLevelSyr(space, trans, uplo, alpha, x, A); } @@ -291,9 +281,6 @@ void 
generalSyrImpl(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Entering IMPL generalSyrImpl(GPU), AViewType = %s\n", - typeid(AViewType).name()); twoLevelSyr(space, trans, uplo, alpha, x, A); } diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 87a5aebca0..1fe96aad9c 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -64,10 +64,6 @@ template void syr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Entering SRC KokkosBlas::syr(), AViewType = %s\n", - typeid(AViewType).name()); - static_assert( Kokkos::SpaceAccessibility::assignable, From 3d0fc4dc51957e9ad0a4b5710ff528a1b6c822e3 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 5 Jun 2023 18:48:18 -0600 Subject: [PATCH 049/231] Addressing feedbacks from Carl --- blas/impl/KokkosBlas2_syr_impl.hpp | 227 ++++++++---------- blas/impl/KokkosBlas2_syr_spec.hpp | 47 +++- blas/src/KokkosBlas2_syr.hpp | 24 +- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 2 +- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 12 +- blas/unit_test/Test_Blas2_syr.hpp | 94 +++++--- 6 files changed, 228 insertions(+), 178 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 5388829030..e31e3672ac 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -25,45 +25,41 @@ namespace KokkosBlas { namespace Impl { -// Functor for a single-level parallel_for version of nontranspose SYR. -// The functor parallelizes over rows of the input matrix A. -template -struct SingleLevelSYR { +// Functor for the thread parallel version of SYR. +// This functor parallelizes over rows of the input matrix A. 
+template +struct ThreadParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - SingleLevelSYR(const bool justTranspose, const bool justUp, - const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : justTranspose_(justTranspose), - justUp_(justUp), - alpha_(alpha), + ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : alpha_(alpha), x_(x), A_(A) { // Nothing to do } KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { - if (alpha_ == Kokkos::ArithTraits::zero()) { - // Nothing to do - } else if (x_(i) == Kokkos::ArithTraits::zero()) { + // Condition 'alpha_ == zero' has already been checked. + if (x_(i) == Kokkos::ArithTraits::zero()) { // Nothing to do } else { const XComponentType x_fixed(x_(i)); const IndexType N(A_.extent(1)); - if (justTranspose_) { + if constexpr(tJustTranspose) { for (IndexType j = 0; j < N; ++j) { - if (((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); } } } else { for (IndexType j = 0; j < N; ++j) { - if (((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj(x_(j))); @@ -74,20 +70,18 @@ struct SingleLevelSYR { } private: - bool justTranspose_; - bool justUp_; AlphaCoeffType alpha_; typename XViewType::const_type x_; AViewType A_; }; -// Single-level parallel version of SYR. +// Thread parallel version of SYR. 
template -void singleLevelSyr(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { + class IndexType, + bool tJustTranspose, bool tJustUp> +void threadParallelSyr(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -100,23 +94,22 @@ void singleLevelSyr(const ExecutionSpace& space, const char trans[], } else { Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - SingleLevelSYR functor( - (trans[0] == 'T') || (trans[0] == 't'), - (uplo[0] == 'U') || (uplo[0] == 'u'), alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[SingleLevel]", rangePolicy, functor); + ThreadParallelSYR functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, functor); } } -struct TwoLevelSYR_LayoutLeftTag {}; -struct TwoLevelSYR_LayoutRightTag {}; +struct TeamParallelSYR_LayoutLeftTag {}; +struct TeamParallelSYR_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- -// Functor for a two-level parallel_reduce version of SYR, designed for -// performance on GPU. Kernel depends on the layout of A. +// Functor for the team parallel version of SYR, designed for +// performance on GPUs. The kernel depends on the layout of A. 
template -struct TwoLevelSYR { + class IndexType, + bool tJustTranspose, bool tJustUp> +struct TeamParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; @@ -124,12 +117,9 @@ struct TwoLevelSYR { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelSYR(const bool justTranspose, const bool justUp, - const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : justTranspose_(justTranspose), - justUp_(justUp), - alpha_(alpha), + TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : alpha_(alpha), x_(x), A_(A) { // Nothing to do @@ -137,91 +127,82 @@ struct TwoLevelSYR { public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelSYR_LayoutLeftTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, const member_type& team) const { - if (alpha_ == Kokkos::ArithTraits::zero()) { + // Condition 'alpha_ == zero' has already been checked + const IndexType j(team.league_rank()); + if (x_(j) == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType j(team.league_rank()); - if (x_(j) == Kokkos::ArithTraits::zero()) { - // Nothing to do + const IndexType M(A_.extent(0)); + if constexpr(tJustTranspose) { + const XComponentType x_fixed(x_(j)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } else { - const IndexType M(A_.extent(0)); - if (justTranspose_) { - const XComponentType x_fixed(x_(j)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { - A_(i, j) 
+= AComponentType(alpha_ * x_(i) * x_fixed); - } - }); - } else { - const XComponentType x_fixed( - Kokkos::ArithTraits::conj(x_(j))); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); - } - }); - } + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelSYR_LayoutRightTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, const member_type& team) const { - if (alpha_ == Kokkos::ArithTraits::zero()) { + // Condition 'alpha_ == zero' has already been checked + const IndexType i(team.league_rank()); + if (x_(i) == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType i(team.league_rank()); - if (x_(i) == Kokkos::ArithTraits::zero()) { - // Nothing to do + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + if constexpr(tJustTranspose) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + }); } else { - const IndexType N(A_.extent(1)); - const XComponentType x_fixed(x_(i)); - if (justTranspose_) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); - } - }); - } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if 
(((justUp_ == true) && (i <= j)) || - ((justUp_ == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(x_(j))); - } - }); - } + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); } } - team.team_barrier(); } private: - bool justTranspose_; - bool justUp_; AlphaCoeffType alpha_; typename XViewType::const_type x_; AViewType A_; }; -// Two-level parallel version of SYR. +// Team parallel version of SYR. template -void twoLevelSyr(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { + class IndexType, + bool tJustTranspose, bool tJustUp> +void teamParallelSyr(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -236,10 +217,10 @@ void twoLevelSyr(const ExecutionSpace& space, const char trans[], } constexpr bool isLayoutLeft = - std::is_same::value; + std::is_same_v; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -250,38 +231,36 @@ void twoLevelSyr(const ExecutionSpace& space, const char trans[], teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelSYR functor( - (trans[0] == 'T') || (trans[0] == 't'), - (uplo[0] == 'U') || (uplo[0] == 'u'), alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[twoLevel]", teamPolicy, functor); + TeamParallelSYR functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[teamParallel]", teamPolicy, functor); } // 
--------------------------------------------------------------------------------------------- -// generalSyr: use 1 level (Range) or 2 level (Team) implementation, -// depending on whether execution space is CPU or GPU. +// generalSyrImpl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// // The 'enable_if' makes sure unused kernels are not instantiated. template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, const char trans[], - const char uplo[], + class IndexType, bool tJustTranspose, bool tJustUp, + typename std::enable_if()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - singleLevelSyr(space, trans, uplo, alpha, x, A); + threadParallelSyr< ExecutionSpace, XViewType, AViewType, IndexType + , tJustTranspose, tJustUp>(space, alpha, x, A); } template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, const char trans[], - const char uplo[], + class IndexType, bool tJustTranspose, bool tJustUp, + typename std::enable_if()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - twoLevelSyr(space, trans, uplo, alpha, x, A); + teamParallelSyr< ExecutionSpace, XViewType, AViewType, IndexType + , tJustTranspose, tJustUp>(space, alpha, x, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 845ba26f77..6bc69d8cb9 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -84,14 +84,53 @@ struct SYR { const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + // Prefer int as the 
index type, but use a larsyr type if needed. if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { - generalSyrImpl( - space, trans, uplo, alpha, x, A); + if (justTranspose) { + if (justUp) { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int + , true, true>(space, alpha, x, A); + } + else { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int + , true, false>(space, alpha, x, A); + } + } + else { + if (justUp) { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int + , false, true>(space, alpha, x, A); + } + else { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int + , false, false>(space, alpha, x, A); + } + } } else { - generalSyrImpl( - space, trans, uplo, alpha, x, A); + if (justTranspose) { + if (justUp) { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t + , true, true>(space, alpha, x, A); + } + else { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t + , true, false>(space, alpha, x, A); + } + } + else { + if (justUp) { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t + , false, true>(space, alpha, x, A); + } + else { + generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t + , false, false>(space, alpha, x, A); + } + } } Kokkos::Profiling::popRegion(); diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 1fe96aad9c..7c0807618e 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -103,7 +103,7 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], } else { std::ostringstream os; os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] - << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + << "'. 
It must be equal to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -113,7 +113,7 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], } else { std::ostringstream oss; oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] - << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + << "'. It must be equal to 'U' or 'u' or 'L' or 'l'"; throw std::runtime_error(oss.str()); } @@ -125,17 +125,15 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR instantiations, by standardizing // on particular View specializations for its template parameters. - typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > - XVT; - - typedef Kokkos::View > - AVT; + using XVT = Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits >; + + using AVT = Kokkos::View >; Impl::SYR::syr(space, trans, uplo, alpha, x, A); } diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index 69b90e85bf..f537b3854a 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct syr_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index c90ff729e5..83c8fd78d7 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -65,7 +65,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ - s.handle, uplo, N, &alpha, X.data(), one, A.data(), LDA)); \ + s.handle, fillMode, N, &alpha, 
X.data(), one, A.data(), LDA)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ @@ -109,7 +109,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ - s.handle, uplo, N, &alpha, X.data(), one, A.data(), LDA)); \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ @@ -156,7 +156,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ - s.handle, uplo, N, \ + s.handle, fillMode, N, \ reinterpret_cast(&alpha), \ reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ @@ -174,7 +174,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ - s.handle, uplo, N, &alpha_val, \ + s.handle, fillMode, N, &alpha_val, \ reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -225,7 +225,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ - s.handle, uplo, N, \ + s.handle, fillMode, N, \ reinterpret_cast(&alpha), \ reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ @@ -243,7 +243,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ - s.handle, uplo, N, &alpha_val, \ + s.handle, fillMode, N, &alpha_val, \ reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 1155a1898f..9e686b5dca 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -14,6 +14,35 @@ // //@HEADER +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation A += alpha * x * x^{T,H}: +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'A' components: float, double, complex, ... +// 03) Execution space: serial, threads, OpenMP, Cuda, ... +// 04) Layout of 'x' +// 05) Layout of 'A' +// 06) Dimension of 'A' +// 07) Options 'const' or 'non const' for x view, when calling syr() +// 08) Usage of analytical results in the tests +// 09) Options 'T' or 'H' when calling syr() +// 10) Options 'U' or 'L' when calling syr() +// +// Choices (01)-(03) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_syr<...>(). +// +// Choices (04)-(10) are selected in routine test_syr<...>(), +// when calling the method test() of class Test::SyrTester<...>. +// +// The class Test::SyrTester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::SyrTester<...>::test() +// is given by the 7 steps named "Step 1 of 7" to "Step 7 of 7" +// in the code below. 
+// ********************************************************************** + #include #include #include @@ -22,8 +51,6 @@ namespace Test { -constexpr double piVal = 3.14159265358979323846; - template class SyrTester { @@ -38,16 +65,15 @@ class SyrTester { const bool useUpOption = false); private: - typedef Kokkos::View _ViewTypeX; - typedef Kokkos::View _ViewTypeA; + using _ViewTypeX = Kokkos::View; + using _ViewTypeA = Kokkos::View; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View - _ViewTypeExpected; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; - typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected, @@ -135,8 +161,8 @@ class SyrTester { const bool _A_is_ll; const bool _testIsGpu; const bool _vanillaUsesDifferentOrderOfOps; - const _AuxType _epsAbs; - const _AuxType _epsRel; + const _AuxType _absTol; + const _AuxType _relTol; int _M; int _N; bool _useAnalyticalResults; @@ -163,8 +189,16 @@ SyrTester::SyrTester() _vanillaUsesDifferentOrderOfOps(false) #endif , - _epsAbs(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), - _epsRel(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. 
+ // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -195,8 +229,8 @@ void SyrTester::test( << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _epsAbs = " << _epsAbs - << ", _epsRel = " << _epsRel + << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol << ", nonConstConstCombinations = " << nonConstConstCombinations << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption @@ -238,7 +272,7 @@ void SyrTester::test( "expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; - ScalarA alpha(0.); + ScalarA alpha(_KAT_A::zero()); // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A @@ -717,7 +751,7 @@ SyrTester:: diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) { - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; @@ -730,7 +764,7 @@ SyrTester:: jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).real()); if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; @@ -749,7 +783,7 @@ SyrTester:: diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) 
{ - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; @@ -762,7 +796,7 @@ SyrTester:: jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).imag()); if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; @@ -929,7 +963,7 @@ SyrTester:: diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); errorHappened = false; if (h_expected(i, j) == 0.) { - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; @@ -942,7 +976,7 @@ SyrTester:: jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; @@ -1055,7 +1089,7 @@ SyrTester:: diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); errorHappened = false; if (h_reference(i, j).real() == 0.) { - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; @@ -1068,7 +1102,7 @@ SyrTester:: jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j).real()); + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).real()); if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; @@ -1086,7 +1120,7 @@ SyrTester:: diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); errorHappened = false; if (h_reference(i, j).imag() == 0.) 
{ - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; @@ -1099,7 +1133,7 @@ SyrTester:: jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j).imag()); + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).imag()); if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; @@ -1268,7 +1302,7 @@ SyrTester:: diff = _KAT_A::abs(h_reference(i, j) - h_A(i, j)); errorHappened = false; if (h_reference(i, j) == 0.) { - diffThreshold = _KAT_A::abs(_epsAbs); + diffThreshold = _KAT_A::abs(_absTol); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; @@ -1281,7 +1315,7 @@ SyrTester:: jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_reference(i, j)); + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; @@ -1719,7 +1753,7 @@ TEST_F(TestCategory, syr_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double_int"); - test_syr("test case syr_mixed_types"); + test_syr("test case syr_double_int"); Kokkos::Profiling::popRegion(); } #endif From 05aa015ff1b00370ddd7ae6ea45b6a4fbade6044 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 6 Jun 2023 09:00:34 -0600 Subject: [PATCH 050/231] Formatting --- blas/impl/KokkosBlas2_syr_impl.hpp | 50 ++++++++++++++-------------- blas/impl/KokkosBlas2_syr_spec.hpp | 52 +++++++++++++----------------- blas/src/KokkosBlas2_syr.hpp | 11 ++++--- blas/unit_test/Test_Blas2_syr.hpp | 5 +-- 4 files changed, 57 insertions(+), 61 deletions(-) diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index e31e3672ac..439ed588db 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -27,7 +27,8 @@ namespace Impl { // Functor for the thread parallel version 
of SYR. // This functor parallelizes over rows of the input matrix A. -template +template struct ThreadParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -35,9 +36,7 @@ struct ThreadParallelSYR { ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) - : alpha_(alpha), - x_(x), - A_(A) { + : alpha_(alpha), x_(x), A_(A) { // Nothing to do } @@ -49,7 +48,7 @@ struct ThreadParallelSYR { const XComponentType x_fixed(x_(i)); const IndexType N(A_.extent(1)); - if constexpr(tJustTranspose) { + if constexpr (tJustTranspose) { for (IndexType j = 0; j < N; ++j) { if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { @@ -77,8 +76,7 @@ struct ThreadParallelSYR { // Thread parallel version of SYR. template + class IndexType, bool tJustTranspose, bool tJustUp> void threadParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { @@ -94,8 +92,10 @@ void threadParallelSyr(const ExecutionSpace& space, } else { Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - ThreadParallelSYR functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, functor); + ThreadParallelSYR + functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, + functor); } } @@ -107,8 +107,7 @@ struct TeamParallelSYR_LayoutRightTag {}; // Functor for the team parallel version of SYR, designed for // performance on GPUs. The kernel depends on the layout of A. 
template + class IndexType, bool tJustTranspose, bool tJustUp> struct TeamParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -119,9 +118,7 @@ struct TeamParallelSYR { TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) - : alpha_(alpha), - x_(x), - A_(A) { + : alpha_(alpha), x_(x), A_(A) { // Nothing to do } @@ -135,7 +132,7 @@ struct TeamParallelSYR { // Nothing to do } else { const IndexType M(A_.extent(0)); - if constexpr(tJustTranspose) { + if constexpr (tJustTranspose) { const XComponentType x_fixed(x_(j)); Kokkos::parallel_for( Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { @@ -168,7 +165,7 @@ struct TeamParallelSYR { } else { const IndexType N(A_.extent(1)); const XComponentType x_fixed(x_(i)); - if constexpr(tJustTranspose) { + if constexpr (tJustTranspose) { Kokkos::parallel_for( Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { if (((tJustUp == true) && (i <= j)) || @@ -198,8 +195,7 @@ struct TeamParallelSYR { // Team parallel version of SYR. 
template + class IndexType, bool tJustTranspose, bool tJustUp> void teamParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { @@ -231,7 +227,9 @@ void teamParallelSyr(const ExecutionSpace& space, teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelSYR functor(alpha, x, A); + TeamParallelSYR + functor(alpha, x, A); Kokkos::parallel_for("KokkosBlas::syr[teamParallel]", teamPolicy, functor); } @@ -245,22 +243,24 @@ void teamParallelSyr(const ExecutionSpace& space, template ()>::type* = nullptr> + typename std::enable_if()>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - threadParallelSyr< ExecutionSpace, XViewType, AViewType, IndexType - , tJustTranspose, tJustUp>(space, alpha, x, A); + threadParallelSyr(space, alpha, x, A); } template ()>::type* = nullptr> + typename std::enable_if()>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - teamParallelSyr< ExecutionSpace, XViewType, AViewType, IndexType - , tJustTranspose, tJustUp>(space, alpha, x, A); + teamParallelSyr(space, alpha, x, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 6bc69d8cb9..b07c3a1446 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -85,50 +85,44 @@ struct SYR { const size_type numCols = A.extent(1); bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); - bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); // Prefer int as the index type, but use a larsyr type if needed. 
if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { if (justTranspose) { if (justUp) { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int - , true, true>(space, alpha, x, A); + generalSyrImpl( + space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); } - else { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int - , true, false>(space, alpha, x, A); - } - } - else { + } else { if (justUp) { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int - , false, true>(space, alpha, x, A); - } - else { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int - , false, false>(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); } } } else { if (justTranspose) { if (justUp) { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t - , true, true>(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); } - else { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t - , true, false>(space, alpha, x, A); - } - } - else { + } else { if (justUp) { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t - , false, true>(space, alpha, x, A); - } - else { - generalSyrImpl< ExecutionSpace, XViewType, AViewType, int64_t - , false, false>(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); } } } diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 7c0807618e..af66767ab4 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -125,11 +125,12 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR instantiations, by standardizing // on particular View specializations for its template parameters. 
- using XVT = Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits >; + using XVT = + Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits >; using AVT = Kokkos::View; + using _ViewTypeExpected = + Kokkos::View; - using _KAT_A = Kokkos::ArithTraits; + using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, From a1f1465015bff7acfd76d31a180ecb20fe544abd Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 9 Jun 2023 17:00:07 -0600 Subject: [PATCH 051/231] Addressing latest feedbacks from Carl regarding rocblas --- blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 83c8fd78d7..1184719138 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -64,6 +64,8 @@ namespace Impl { KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -108,6 +110,8 @@ namespace Impl { KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -155,6 +159,8 @@ namespace Impl { 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ s.handle, fillMode, N, \ reinterpret_cast(&alpha), \ @@ -173,6 +179,8 @@ namespace Impl { KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ s.handle, fillMode, N, &alpha_val, \ reinterpret_cast(X.data()), one, \ @@ -224,6 +232,8 @@ namespace Impl { KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ s.handle, fillMode, N, \ reinterpret_cast(&alpha), \ @@ -242,6 +252,8 @@ namespace Impl { KokkosBlas::Impl::RocBlasSingleton::singleton(); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ s.handle, fillMode, N, &alpha_val, \ reinterpret_cast(X.data()), one, \ From 086cf75b143e17655273030576900965dba0f24b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 13 Jun 2023 13:03:08 -0600 Subject: [PATCH 052/231] Update develop to 4.1.99 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbaa3f2ffe..c144a8c107 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR 
${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 0) +SET(KokkosKernels_VERSION_MINOR 1) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") From aee776e44588481b2b99cb047c1b572f77c66ef7 Mon Sep 17 00:00:00 2001 From: "Roscoe A. Bartlett" Date: Tue, 13 Jun 2023 17:04:17 -0600 Subject: [PATCH 053/231] KokkosKernels: Don't list include for non-existant 'batched' build dir (trilinos/Trilinos#11966) See the embedded comment. This fixes the generated KokkosKernelsTargets.cmake file in the build dir when tests are not enabled. NOTE: This is a very non-conventional CMake build system. Your never see CMakeLists.txt files that are included with include(). These should always be pulled in using add_subdirectory(). If you want to just include() some CMake code, you call it `.cmake` and then include() it. --- batched/CMakeLists.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 3f13ac5084..3103dfa8a0 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -17,9 +17,18 @@ IF (NOT KokkosKernels_ENABLE_COMPONENT_BLAS) LIST(APPEND SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/batched/KokkosBatched_Util.cpp) ENDIF() -# Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/batched) +IF(KokkosKernels_ENABLE_TESTS OR KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) + # Adding unit-tests + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING + ${CMAKE_CURRENT_SOURCE_DIR}/batched) +ENDIF() +# NOTE: Above, the build directory 'batched' is not created unless unit tests +# are actually enabled (which are actually included from the 
base-level +# CMakeLists.txt file). And the KokkosKernelsTargets.cmake file that gets +# generated from this CMake package in the build dir will be broken if these +# are listed in the `INTERFACE_INCLUDE_DIRECTORIES` property when the build +# `batched` is not created (see Trilinos PR #11966). KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm COMPONENTS batched From 8ec7fee767bc06adb6fe3903bf5d3d99bf976806 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 14 Jun 2023 21:32:13 +0000 Subject: [PATCH 054/231] MKL: switch from int to MKL_INT This allows the library to build against MKL when it is configured to support long long int although no ETI will be provided for that use case. Users will need to specify carefully their types to go through the MKL layer when configured with long long int. --- .../KokkosBlas2_serial_gemv_tpl_spec_decl.hpp | 2 +- perf_test/sparse/KokkosSparse_spadd.cpp | 20 +-- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 14 +- ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 18 +-- ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 18 ++- ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 20 +-- ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 16 ++- ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 22 ++-- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 8 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 122 +++++++++--------- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 8 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 64 ++++----- 12 files changed, 178 insertions(+), 154 deletions(-) diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index f689ba079c..6f6a7a2e9f 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -19,7 +19,7 @@ #include "KokkosBlas_util.hpp" #include "KokkosBatched_Vector.hpp" -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && 
!defined(KOKKOS_ENABLE_SYCL) #include "mkl_version.h" #if __INTEL_MKL__ >= 2018 #define __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ 1 diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f69d24d523..8e2b6b0efd 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -328,14 +328,18 @@ void run_experiment(int argc, char** argv, CommonInputParams) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), - (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), - A.values.data())); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), - (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), - B.values.data())); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), + (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), + A.values.data())); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), + (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), + B.values.data())); + } else { + throw std::runtime_error("MKL configured with long long int not supported in Kokkos Kernels"); + } } #endif diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp index 81d3273e17..c59bce46cc 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + 
namespace KokkosSparse { namespace Impl { @@ -63,13 +67,13 @@ SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) template <> \ struct spgemm_noreuse_tpl_spec_avail< \ KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ + SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>, \ KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>> { \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>> { \ enum : bool { value = true }; \ }; diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp index f3d32a01fb..e0f587f7c1 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp @@ -230,21 +230,21 @@ Matrix spgemm_noreuse_mkl(const MatrixConst &A, const MatrixConst &B) { template <> \ struct SPGEMM_NOREUSE< \ KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ + SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>, \ KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ true, TPL_AVAIL> { \ using Matrix = KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>; \ + SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>; \ using ConstMatrix = KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>; \ + const SCALAR, const 
MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>; \ static KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int> \ + SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT> \ spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ Kokkos::ArithTraits::name() + "]"; \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index bfba70d913..f0b03f94f8 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + namespace KokkosSparse { namespace Impl { @@ -133,30 +137,30 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) template <> \ struct spgemm_numeric_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct SPGEMM_NUMERIC, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true, 
TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>; \ using c_int_view_t = \ - Kokkos::View, \ Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 80454be92b..fd688e4c42 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -105,21 +109,21 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) template <> \ struct spgemm_symbolic_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index 5db0fa18a9..a55428c2e6 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -594,8 +594,8 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(0); return; } - MKLMatrix A(m, n, 
(int *)rowptrA.data(), (int *)colidxA.data(), nullptr); - MKLMatrix B(n, k, (int *)rowptrB.data(), (int *)colidxB.data(), nullptr); + MKLMatrix A(m, n, (MKL_INT *)rowptrA.data(), (MKL_INT *)colidxA.data(), nullptr); + MKLMatrix B(n, k, (MKL_INT *)rowptrB.data(), (MKL_INT *)colidxB.data(), nullptr); sparse_matrix_t C; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -625,32 +625,32 @@ void spgemm_symbolic_mkl( template <> \ struct SPGEMM_SYMBOLIC< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>; \ using c_int_view_t = \ - Kokkos::View, \ Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 6846e27748..eb0ab13c76 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Experimental { namespace Impl { @@ -124,8 +128,8 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #define 
KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const MKL_INT, Kokkos::Device,\ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 36a64228b8..10b609733c 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,14 +42,14 @@ inline matrix_descr getDescription() { } inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - int m, int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, + MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -57,15 +57,15 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, + double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, 
SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -74,16 +74,16 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -95,16 +95,16 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr 
A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -115,15 +115,15 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, } inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, - float beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, + float beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, int colx, int ldx, float* y, int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, @@ -132,15 +132,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, } inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, + double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, @@ -150,17 +150,17 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spm_mv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex 
beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -173,15 +173,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, inline void spm_mv_block_impl_mkl( sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, int b, const int* Arowptrs, - const int* Aentries, const Kokkos::complex* Avalues, + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -196,25 +196,25 @@ inline void spm_mv_block_impl_mkl( #if (__INTEL_MKL__ == 2017) -inline void spmv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, +inline void spmv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + 
MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, +inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -229,8 +229,8 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -245,18 +245,18 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); } -inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, +inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, int colx, int ldx, 
float* y, int ldy) { mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, +inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, @@ -264,9 +264,9 @@ inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, int m, } inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { @@ -282,7 +282,7 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, inline void spm_mv_block_impl_mkl( char mode, Kokkos::complex alpha, Kokkos::complex beta, - int m, int n, int b, const int* Arowptrs, const int* Aentries, + MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { const MKL_Complex16* alpha_mkl = @@ -301,16 +301,16 @@ inline void spm_mv_block_impl_mkl( #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ 
Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -358,8 +358,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const**, \ + SCALAR const, MKL_INT const, Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ SCALAR**, Kokkos::LayoutLeft, \ @@ -367,8 +367,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 4a92741cc5..8e902551cd 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -214,8 +218,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const 
MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6cbd1fff29..bafaac30b6 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -522,8 +522,8 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, - int n, const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, + MKL_INT n, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; @@ -531,15 +531,15 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, - int n, const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, MKL_INT m, + MKL_INT n, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; @@ -547,16 +547,16 @@ inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, 
A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -566,8 +566,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -577,8 +577,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -588,8 +588,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( - &A_mkl, 
SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; @@ -601,16 +601,16 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -647,23 +647,23 @@ inline char mode_kk_to_mkl(char mode_kk) { "Invalid mode for MKL (should be one of N, T, H)"); } -inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, - const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, - const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, double alpha, double beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, 
Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -678,8 +678,8 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -697,16 +697,16 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ From 4e054a1922bbd3906ceacb311e3187911c2c15b8 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 14 Jun 2023 15:38:14 -0600 Subject: [PATCH 055/231] MKL_INT: applying clang-format --- perf_test/sparse/KokkosSparse_spadd.cpp | 3 +- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 27 ++--- ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 60 ++++++----- ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 66 ++++++------ ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 20 ++-- 
...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 42 ++++---- ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 100 +++++++++--------- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 3 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 97 +++++++++-------- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 5 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 30 +++--- 11 files changed, 242 insertions(+), 211 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 8e2b6b0efd..e8a0b19419 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -338,7 +338,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); } else { - throw std::runtime_error("MKL configured with long long int not supported in Kokkos Kernels"); + throw std::runtime_error( + "MKL configured with long long int not supported in Kokkos Kernels"); } } #endif diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp index c59bce46cc..ea3edb518f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp @@ -63,18 +63,21 @@ SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_noreuse_tpl_spec_avail< \ - KokkosSparse::CrsMatrix< \ - SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>> { \ - enum : bool { value = true }; \ +#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct 
spgemm_noreuse_tpl_spec_avail< \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>> { \ + enum : bool { value = true }; \ }; #define SPGEMM_NOREUSE_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp index e0f587f7c1..1067f3924f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp @@ -226,33 +226,39 @@ Matrix spgemm_noreuse_mkl(const MatrixConst &A, const MatrixConst &B) { return Matrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); } -#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NOREUSE< \ - KokkosSparse::CrsMatrix< \ - SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>, \ - true, TPL_AVAIL> { \ - using Matrix = KokkosSparse::CrsMatrix< \ - SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT>; \ - using ConstMatrix = KokkosSparse::CrsMatrix< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>; \ - static KokkosSparse::CrsMatrix< \ - SCALAR, MKL_INT, Kokkos::Device, void, MKL_INT> \ - spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ - std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - Matrix C = spgemm_noreuse_mkl(A, B); \ - Kokkos::Profiling::popRegion(); \ - return C; \ - } \ +#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix, void, \ 
+ MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + true, TPL_AVAIL> { \ + using Matrix = \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>; \ + using ConstMatrix = KokkosSparse::CrsMatrix< \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>; \ + static KokkosSparse::CrsMatrix, \ + void, MKL_INT> \ + spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ + std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + Matrix C = spgemm_noreuse_mkl(A, B); \ + Kokkos::Profiling::popRegion(); \ + return C; \ + } \ }; #define SPGEMM_NOREUSE_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index f0b03f94f8..e144b53162 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -133,40 +133,40 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ +#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index efd5b063f7..6c87c60caf 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -553,30 +553,30 @@ void spgemm_numeric_mkl( #define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ template <> \ struct SPGEMM_NUMERIC, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>; \ using c_int_view_t = \ - Kokkos::View, \ Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 
fd688e4c42..1fcfa7132a 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -105,28 +105,28 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ +#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index a55428c2e6..e662934d00 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -594,8 +594,10 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(0); return; } - MKLMatrix A(m, n, (MKL_INT *)rowptrA.data(), (MKL_INT *)colidxA.data(), nullptr); - MKLMatrix B(n, k, (MKL_INT *)rowptrB.data(), (MKL_INT *)colidxB.data(), nullptr); + MKLMatrix A(m, n, (MKL_INT *)rowptrA.data(), (MKL_INT *)colidxA.data(), + nullptr); + MKLMatrix B(n, k, (MKL_INT *)rowptrB.data(), (MKL_INT 
*)colidxB.data(), + nullptr); sparse_matrix_t C; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -621,53 +623,53 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(rowptrC(m)); } -#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_SYMBOLIC< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>; \ - using c_int_view_t = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_symbolic(KernelHandle *handle, \ - typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, \ - typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, \ - bool, c_int_view_t row_mapB, \ - c_int_view_t entriesB, bool, \ - int_view_t row_mapC, bool) { \ - std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ - entriesA, row_mapB, entriesB, row_mapC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_symbolic(KernelHandle *handle, \ + typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, \ + typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, \ + bool, c_int_view_t row_mapB, \ + c_int_view_t entriesB, bool, \ + int_view_t row_mapC, bool) { \ + std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ + entriesA, row_mapB, entriesB, row_mapC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_SYMBOLIC_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index eb0ab13c76..b9c1f6c1dd 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -128,7 +128,8 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const MKL_INT, Kokkos::Device,\ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp 
b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 10b609733c..d37e394b65 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,7 +42,8 @@ inline matrix_descr getDescription() { } inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; @@ -58,9 +59,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const double* Avalues, const double* x, - double* y) { + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -74,8 +75,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -95,8 +97,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const 
Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -116,9 +119,10 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const float* Avalues, const float* x, - int colx, int ldx, float* y, int ldy) { + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, int colx, int ldx, float* y, + int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -133,7 +137,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { sparse_matrix_t A_mkl; @@ -150,8 +155,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spm_mv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, @@ -173,10 +178,10 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, inline void spm_mv_block_impl_mkl( sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, int ldx, - Kokkos::complex* y, int ldy) { + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const 
MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -213,8 +218,9 @@ inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -229,8 +235,9 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -254,18 +261,19 @@ inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, int colx, int ldx, double* y, - int ldy) { +inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, + MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, + const double* Avalues, const double* x, + int colx, int ldx, double* y, int ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); } inline void spm_mv_block_impl_mkl(char mode, 
Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - MKL_INT b, const MKL_INT* Arowptrs, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, int colx, @@ -280,11 +288,14 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); } -inline void spm_mv_block_impl_mkl( - char mode, Kokkos::complex alpha, Kokkos::complex beta, - MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - int colx, int ldx, Kokkos::complex* y, int ldy) { +inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, int colx, + int ldx, Kokkos::complex* y, + int ldy) { const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); @@ -301,16 +312,17 @@ inline void spm_mv_block_impl_mkl( #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, MKL_INT const, Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, MKL_INT const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -358,8 +370,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define 
KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, MKL_INT const, Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ SCALAR**, Kokkos::LayoutLeft, \ @@ -367,8 +380,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, MKL_INT const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 8e902551cd..060fef45bb 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -218,8 +218,9 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const MKL_INT, Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index bafaac30b6..ecbe45c7fd 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -523,8 +523,9 @@ namespace Impl { // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, - 
MKL_INT n, const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const float* Avalues, const float* x, float* y) { + MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, float* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -538,9 +539,10 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, MKL_INT m, - MKL_INT n, const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const double* Avalues, const double* x, double* y) { +inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -601,16 +603,17 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, MKL_INT const, Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, MKL_INT const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -697,16 +700,17 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, MKL_INT const, Kokkos::Device, \ - Kokkos::MemoryTraits, 
MKL_INT const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, MKL_INT const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ From c4bdc88fa50fd722f2bb5831bb37ab2bcc6e7bbc Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 15 Jun 2023 01:25:52 -0600 Subject: [PATCH 056/231] Correcting bug causing compilation errors with Trilinos (configurations of Kokkos and KK requesting CudaUVM) --- blas/unit_test/Test_Blas2_syr.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index f79806fb28..83f8a8c175 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -155,7 +155,7 @@ class SyrTester { void callKkGerAndCompareKkSyrAgainstIt( const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, - const _ViewTypeExpected& h_A_syr, const std::string& situation); + const _HostViewTypeA& h_A_syr, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -1429,7 +1429,7 @@ void SyrTester:: callKkGerAndCompareKkSyrAgainstIt( const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, - const _ViewTypeExpected& h_A_syr, const std::string& situation) { + const _HostViewTypeA& h_A_syr, const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); From 930be1da4f327e324a634a7c12dfd6828a98daf4 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 15 Jun 2023 16:34:52 +0000 Subject: [PATCH 057/231] MKL_INT: final change for 
colx, ldx and ldy in bsr spmv These changes reflect the same type usage as the indices used for matrix indexing. --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index d37e394b65..db9f6029ea 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -121,8 +121,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, - const float* x, int colx, int ldx, float* y, - int ldy) { + const float* x, MKL_INT colx, MKL_INT ldx, float* y, + MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -140,7 +140,7 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, - int colx, int ldx, double* y, int ldy) { + MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -159,8 +159,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { + const Kokkos::complex* x, MKL_INT colx, + MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -181,7 +181,7 @@ inline void spm_mv_block_impl_mkl( 
Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, - int colx, int ldx, Kokkos::complex* y, int ldy) { + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -255,8 +255,8 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, - const float* x, int colx, int ldx, float* y, - int ldy) { + const float* x, MKL_INT colx, MKL_INT ldx, float* y, + MKL_INT ldy) { mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } @@ -266,7 +266,7 @@ inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, - int colx, int ldx, double* y, int ldy) { + MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); } @@ -276,8 +276,8 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { + const Kokkos::complex* x, MKL_INT colx, + MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); const MKL_Complex8* Avalues_mkl = @@ -293,9 +293,9 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, 
const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, - int ldy) { + const Kokkos::complex* x, MKL_INT colx, + MKL_INT ldx, Kokkos::complex* y, + MKL_INT ldy) { const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); @@ -396,9 +396,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - int colx = static_cast(X.extent(1)); \ - int ldx = static_cast(X.stride_1()); \ - int ldy = static_cast(Y.stride_1()); \ + MKL_INT colx = static_cast(X.extent(1)); \ + MKL_INT ldx = static_cast(X.stride_1()); \ + MKL_INT ldy = static_cast(Y.stride_1()); \ spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ A.graph.entries.data(), A.values.data(), X.data(), \ From 4fc51ea866dba7ee25797cbb4aaa7e75a0284945 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 15 Jun 2023 10:38:39 -0600 Subject: [PATCH 058/231] MKL_INT: clang-format --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index db9f6029ea..c6136eab3e 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -121,8 +121,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, float* y, - MKL_INT ldy) { + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { sparse_matrix_t A_mkl; 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -140,7 +140,8 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, - MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { + MKL_INT colx, MKL_INT ldx, double* y, + MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -153,14 +154,12 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, ldx, beta, y, ldy)); } -inline void spm_mv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, MKL_INT colx, - MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { +inline void spm_mv_block_impl_mkl( + sparse_operation_t op, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, @@ -255,18 +254,16 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, float* y, - MKL_INT ldy) { + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void 
spm_mv_block_impl_mkl(char mode, double alpha, double beta, - MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const double* Avalues, const double* x, - MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { +inline void spm_mv_block_impl_mkl( + char mode, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, + const double* x, MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); } @@ -277,7 +274,8 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, MKL_INT colx, - MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { + MKL_INT ldx, Kokkos::complex* y, + MKL_INT ldy) { const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); const MKL_Complex8* Avalues_mkl = @@ -293,9 +291,9 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, MKL_INT colx, - MKL_INT ldx, Kokkos::complex* y, - MKL_INT ldy) { + const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, + Kokkos::complex* y, MKL_INT ldy) { const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); @@ -396,9 +394,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - MKL_INT colx = static_cast(X.extent(1)); \ - MKL_INT ldx = static_cast(X.stride_1()); \ - MKL_INT ldy = static_cast(Y.stride_1()); \ + MKL_INT colx = static_cast(X.extent(1)); \ + MKL_INT ldx = static_cast(X.stride_1()); \ + 
MKL_INT ldy = static_cast(Y.stride_1()); \ spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ A.graph.entries.data(), A.values.data(), X.data(), \ From a7c2f6d25c9a6de3071a9ddf0193391ef5f6f654 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 15 Jun 2023 11:56:31 -0600 Subject: [PATCH 059/231] Fix for rocblas builds - Rename uploChar to uplo in fillMode for consistency with usage - Dereference uplo in fillMode usage for comparison to char --- blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 1184719138..46a355c088 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -28,7 +28,7 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + rocblas_fill fillMode = (*uplo == 'L' || *uplo == 'l') \ ? rocblas_fill_lower \ : rocblas_fill_upper; From f5e2f8254aaa4e5e3dd4caa51f1597098fff4641 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 15 Jun 2023 12:05:12 -0600 Subject: [PATCH 060/231] apply clang-format --- blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 46a355c088..13e2bd21b1 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -28,9 +28,8 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (*uplo == 'L' || *uplo == 'l') \ - ? rocblas_fill_lower \ - : rocblas_fill_upper; + rocblas_fill fillMode = (*uplo == 'L' || *uplo == 'l') ? rocblas_fill_lower \ + : rocblas_fill_upper; #define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ ETI_SPEC_AVAIL) \ From c722dc0a5e1bb4390eff83d9ef72cd3312ac7156 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 16 Jun 2023 14:07:39 -0600 Subject: [PATCH 061/231] Backward-compatible fix with kokkos@4.0 --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 7a93309e65..f413ba612c 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -94,9 +94,11 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: case BaseTplAlgos::ARMPL: +#if KOKKOS_VERSION > 40099 assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); +#endif break; default: std::ostringstream os; From 4a1a763c1314ab67c2b7db3eba9a1f4e90cefab6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 20 Jun 2023 13:18:01 -0600 Subject: [PATCH 062/231] docs: Add testing table --- docs/_static/table_theme.css | 5 + docs/conf.py | 4 +- docs/developer/index.rst | 2 +- docs/developer/testing_table.rst | 526 +++++++++++++++++++++++++++++++ 4 files changed, 535 insertions(+), 2 deletions(-) create mode 100644 docs/_static/table_theme.css create mode 100644 docs/developer/testing_table.rst diff --git a/docs/_static/table_theme.css b/docs/_static/table_theme.css new file mode 100644 index 0000000000..7271d0e2fd --- 
/dev/null +++ b/docs/_static/table_theme.css @@ -0,0 +1,5 @@ +.wy-nav-content { + height: 100%; + max-width: 100% !important; + margin: auto; +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index f7027880c5..cfed3629aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,4 +79,6 @@ def configureDoxyfile(input_dir, output_dir, doxyfile_in, doxyfile_out): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] +html_static_path = ['_static'] + +html_css_files = ['table_theme.css'] diff --git a/docs/developer/index.rst b/docs/developer/index.rst index 58f89084ac..c95a3ee63b 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -9,4 +9,4 @@ Developer Manual Code Style Guide Contributing Profiling - \ No newline at end of file + Testing Table \ No newline at end of file diff --git a/docs/developer/testing_table.rst b/docs/developer/testing_table.rst new file mode 100644 index 0000000000..4b9600fa04 --- /dev/null +++ b/docs/developer/testing_table.rst @@ -0,0 +1,526 @@ +Testing Table +============= + +SAND2023-05267O [#]_ + +Below is a testing table summarizing the KokkosKernels continuous integration and nightly test coverage. + +The following is a description of abbreviations used throughout the testing table. + +* ETI: Explicit template instantiation +* PR: Pull Request +* LEFT: LayoutLeft +* RIGHT: LayoutRight +* REL: CMake release build type +* DBG: CMake debug build type +* BCHK: Kokkos core bounds checking +* UVM: Unified Memory (Cuda) + +The following is a description of column headings in the testing table. 
+ + * Project: the Jenkins project name for the test case + * Architectures: the test case's coverage architectures + * Compilers: the covered compilers + * Backends: the covered Kokkos core backends + * Scalars: the covered ETI'd scalar types + * Ordinals: the covered ETI'd ordinal types + * Offsets: the covered ETI'd offset types + * Layouts: the covered ETI'd Kokkos core layout types + +.. list-table:: + :align: center + :header-rows: 1 + :stub-columns: 0 + :width: 100% + :widths: auto + + + * - Project + - Architectures + - Compilers + - Backends + - Scalars + - Ordinals + - Offsets + - Layouts + + * * `PR_A64FX_ARMPL2110_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * A64FX + * ARMPL 21.1.10 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_A64FX_ARMPL2110_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * A64FX + * ARMPL 21.1.10 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_A64FX_GCC1020_OPENMP_SERIAL_LEFT_REL` + * A64FX + * GNU 10.2.0 + * OpenMP, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_VEGA908_ROCM520_HIP_SERIAL_LEFT_REL` + * VEGA908 + * ROCM 5.2.0 + * Hip, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PRTONIGHTLY_VEGA908_ROCM520_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL` + * VEGA908 + * ROCM 5.2.0 + * Hip, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_OPENMP_SERIAL_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL` + * Power8, Pascal60 -- Power9, Volta70 + * GNU 9.3.0 -- Clang 13.0.0, Cuda 10.1.243 + * OpenMP, Serial -- Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_POWER9_VOLTA70_CUDA11_OPENMP_CUDA_LEFT_RIGHT_REL` + * Power9, Volta70 + * GNU 8.3.1, Cuda 11.2.2 + * Cuda, OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_SKX_GNU1020_OPENMP_LEFT_REL` + * Skx + 
* GNU 10.2.0 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_GNU1020_THREADS_SERIAL_RIGHT_REL` + * Skx + * GNU 10.2.0 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutRight + + * * `PR_SKX_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL` + * Skx + * GNU 10.2.0 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_INTEL19_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL` + * Skx + * Intel 19.5.281 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_CLANG1001_THREADS_SERIAL_LEFT_REL` + * Skx + * Clang 10.0.1 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_RELWITHDBG_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_THREADS_LEFT_RIGHT_RELWITHDBG_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_DBG` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_REL_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `NIGHTLY_SKX_GNU1020_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 10.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_GNU820_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 8.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * 
`NIGHTLY_SKX_GNU820_OPENMP_THREADS_SERIAL_LEFT_REL` + * SKX + * GNU 8.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_GNU920_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 9.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_OPENMP_LEFT_DBG` + * SKX + * Intel 19.0.5 + * OpenMp + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_SERIAL_LEFT_DBG` + * SKX + * Intel 19.0.5 + * Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_THREADS_LEFT_DBG` + * SKX + * Intel 19.0.5 + * Threads + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_OPENMP_LEFT_MKL_DBG` + * SKX + * Intel 19.0.5 + * OPENMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_OPENMP_CUDA_LEFT_REL` + * SKX + * Cuda 11.2.2 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_REL` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_REL_UVM_RDC` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_DBG_BCHK` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_CUBLAS_CUSPARSE_REL_BCHK` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA908_ROCM520_SERIAL_HIP_LEFT_REL` + * VEGA908 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * 
int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA908_ROCM520_SERIAL_HIP_LEFT_ROCBLAS_ROCSPARSE_REL` + * VEGA908 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA906_ROCM520_SERIAL_HIP_LEFT_REL` + * VEGA906 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA906_ROCM520_SERIAL_HIP_LEFT_DBG_BCHK` + * VEGA906 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_AMPERE80_CUDA11_SERIAL_CUDA_LEFT_DBG` + * AMPERE80 + * Cuda 11.7.99 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG10_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Clang 10.0.0 + * Serial, OpenMP, Threads + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA10_CUDA_SERIAL_LEFT_RELWITHDBG` + * Volta70 + * Cuda 10.1 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA117_CUDA_SERIAL_LEFT_RELWITHDBG` + * Volta70 + * Cuda 11.7 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_REL` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_DBG` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_REL_CPP20` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA110_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 11.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * 
`NIGHTLY_VOLTA70_CUDA120_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 12.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA120_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 12.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU830_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Gnu 8.3.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU910_GNU920_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Gnu 9.1.0, Gnu 9.2.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU830_GNU910_SERIAL_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * Volta70 + * Gnu 9.1.0, Gnu 9.2.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_A64FX_ARMPL2030_SERIAL_OPENMP_LEFT_ARMPLLBLAS_ARMPLSLAPACK_REL` + * A64FX + * Armpl 20.3.0 + * OpenMP, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP_SERIAL_PTHREAD_LEFT_REL` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_DBG_BCHK` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_DBG_BCHK` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_REL_UVM` + * Volta70 + * 
Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_HSW_INTEL19_OPENMP_LEFT_RELWITHDBG` + * Hsw + * Intel 19.1.3.20200925 + * OpenMP + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_KNL_INTEL19_OPENMP_LEFT_RELWITHDBG` + * Hsw + * Intel 19.1.3.20200925 + * OpenMP + * double + * int + * `size_t` + * LayoutLeft + +.. rubric:: Footnotes + +.. [#] This article has been authored by an employee of National Technology & Engineering Solutions of Sandia, LLC under Contract No. DE-NA0003525 with the U.S. Department of Energy (DOE). The employee owns all right, title and interest in and to the article and is solely responsible for its contents. The United States Government retains and the publisher, by accepting the article for publication, acknowledges that the United States Government retains a non-exclusive, paid-up, irrevocable, world-wide license to publish or reproduce the published form of this article or allow others to do so, for United States Government purposes. The DOE will provide public access to these results of federally sponsored research in accordance with the DOE Public Access Plan https://www.energy.gov/downloads/doe-public-access-plan. SAND2023-05267O. 
\ No newline at end of file From ee943cda7fdd86f46f3d93de7889ae492c709fd2 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 22 Jun 2023 15:54:26 -0600 Subject: [PATCH 063/231] use DOWNLOAD_EXTRACT_TIMESTAMP in FetchContent_Declare when supported --- cmake/kokkoskernels_benchmarks.cmake | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake index 3a38feee88..07f0515b4e 100644 --- a/cmake/kokkoskernels_benchmarks.cmake +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -18,11 +18,27 @@ ELSE() # Note: recent bug (google/benchmark#1441) is preventing us from using # the latest benchmark release. SET(BENCHMARK_VERSION 1.6.2) - FetchContent_Declare( - googlebenchmark - URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b - ) + + # CMake 3.24 introduced DOWNLOAD_EXTRACT_TIMESTAMP, which controls whether + # extracting this file archive sets the file times to archive time (TRUE), + # or to extraction time (FALSE). 
+ # In CMake 3.24+, the default is FALSE + # Prior, it did not exist, and was effectively TRUE + # Here, we okay the new default to silence CMP0135 warning + IF (${CMAKE_VERSION} VERSION_LESS "3.24.0") + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + ) + ELSE() + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + DOWNLOAD_EXTRACT_TIMESTAMP FALSE + ) + ENDIF() FetchContent_MakeAvailable(googlebenchmark) LIST(POP_BACK CMAKE_MESSAGE_INDENT) From d0af7f1e6d21c3c3b11dce11db46369216abc27c Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 22 Jun 2023 17:24:15 -0600 Subject: [PATCH 064/231] Improve help text in perf tests (#1875) - Have common infrastructure detect "-h" and "--help", and print options - In help text, distinguish between enabled and non-enabled backends --- .../KokkosKernels_perf_test_instantiation.hpp | 10 ++ .../KokkosKernels_perf_test_utilities.hpp | 92 ++++++++++++++----- perf_test/sparse/KokkosSparse_spadd.cpp | 7 -- perf_test/sparse/KokkosSparse_spgemm.cpp | 7 -- 4 files changed, 78 insertions(+), 38 deletions(-) diff --git a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp index 9ed5ec23bc..6844922ddb 100644 --- a/perf_test/KokkosKernels_perf_test_instantiation.hpp +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -26,10 +26,20 @@ #error "The macro KOKKOSKERNELS_PERF_TEST_NAME was not defined" #endif +// All perf tests must implement print_options() +void print_options(); + int main_instantiation(int argc, char** argv) { perf_test::CommonInputParams params; perf_test::parse_common_options(argc, argv, params); + // If help is requested with "-h" or "--help", then just print the options + // and quit. 
+ if (params.print_help) { + print_options(); + return 0; + } + /* Assumption is that use_openmp/use_threads variables are */ /* provided as numbers of threads */ int num_threads = 1; diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index 0df96f4494..1303b2370e 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -33,20 +33,56 @@ struct CommonInputParams { int use_openmp = 0; int use_threads = 0; - int repeat = 0; + int repeat = 0; + bool print_help = false; }; std::string list_common_options() { std::ostringstream common_options; common_options - << "\t[Required] BACKEND:\n" - << "\t\t'--threads [numThreads]' |\n" - << "\t\t'--openmp [numThreads]' |\n" - << "\t\t'--cuda [deviceIndex]' |\n" - << "\t\t'--hip [deviceIndex]' |\n" - << "\t\t'--sycl [deviceIndex]'\n\n" - << "\tIf no parallel backend is requested, Serial will be used " - "(if enabled)\n\n"; + << "\t[Required] Backend: the available backends are:\n" +#ifdef KOKKOS_ENABLE_THREADS + << "\t\t'--threads [numThreads]'\n" +#endif +#ifdef KOKKOS_ENABLE_OPENMP + << "\t\t'--openmp [numThreads]'\n" +#endif +#ifdef KOKKOS_ENABLE_CUDA + << "\t\t'--cuda [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << "\t\t'--hip [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_SYCL + << "\t\t'--sycl [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_SERIAL + << "\t\tIf no parallel backend is requested, Serial will be used.\n" +#endif + << "\n" + << "\t The following backends are not available because Kokkos was not " + "configured with them:\n" +#ifndef KOKKOS_ENABLE_THREADS + << "\t\t'--threads [numThreads]'\n" +#endif +#ifndef KOKKOS_ENABLE_OPENMP + << "\t\t'--openmp [numThreads]'\n" +#endif +#ifndef KOKKOS_ENABLE_CUDA + << "\t\t'--cuda [deviceIndex]'\n" +#endif +#ifndef KOKKOS_ENABLE_HIP + << "\t\t'--hip [deviceIndex]'\n" +#endif +#ifndef KOKKOS_ENABLE_SYCL + << "\t\t'--sycl [deviceIndex]'\n" +#endif 
+#ifndef KOKKOS_ENABLE_SERIAL + << "\t\tSerial is not enabled so a parallel backend must be selected.\n" +#endif + << "\n" + << "\t[Optional]:\n" + << "\t\t'-h', '--help': show available options\n\n"; return common_options.str(); } @@ -155,34 +191,42 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { // If e.g. params.use_cuda is 0, that means CUDA will not be used at all. // But if it's N, then it means run on CUDA device N-1. while (argIdx < argc) { - bool remove_flag = false; + // How many flags to delete from argc/argv + // 0: not a common option, so leave it + // 1: a bool parameter like '-h' + // 2: a parameter followed by a value, like "--cuda 0" + int remove_flags = 0; if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--openmp", params.use_openmp)) { - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { params.use_cuda++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--hip", params.use_hip)) { params.use_hip++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { params.use_sycl++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { - remove_flag = true; + remove_flags = 2; + } else if (check_arg_bool(argIdx, argc, argv, "-h", params.print_help) || + check_arg_bool(argIdx, argc, argv, "--help", + params.print_help)) { + remove_flags = 1; } - if (remove_flag) { - // Shift the remainder of the argv list by one. Note that argv has - // (argc + 1) arguments, the last one always being nullptr. 
The following - // loop moves the trailing nullptr element as well - for (int k = argIdx; k < argc - 1; ++k) { - argv[k] = argv[k + 2]; - argv[k + 1] = argv[k + 3]; + if (remove_flags) { + // Shift the remainder of the argv list left by the number of flags + // removed. Note that argv has (argc + 1) arguments, the last one always + // being nullptr. The following loop moves the trailing nullptr element + // as well + for (int k = argIdx + remove_flags; k <= argc; ++k) { + argv[k - remove_flags] = argv[k]; } - argc = argc - 2; + argc -= remove_flags; } else { ++argIdx; } diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index e8a0b19419..f27d7d93db 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -90,7 +90,6 @@ void print_options() { } int parse_inputs(LocalParams& params, int argc, char** argv) { - bool printHelp = false; bool discard; for (int i = 1; i < argc; ++i) { // if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { @@ -131,8 +130,6 @@ int parse_inputs(LocalParams& params, int argc, char** argv) { ++i; } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", params.verbose)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -140,10 +137,6 @@ int parse_inputs(LocalParams& params, int argc, char** argv) { return 1; } } - if (printHelp) { - print_options(); - return 1; - } return 0; } diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index cee68ef11a..2d03be80ac 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -158,7 +158,6 @@ void print_options() { int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** argv) { 
std::string algoStr; - bool printHelp; for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "--repeat", params.repeat)) { ++i; @@ -276,8 +275,6 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } ++i; - } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -285,10 +282,6 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } } - if (printHelp) { - print_options(); - return 1; - } return 0; } From f779c2d7f45979e9ac9643a04b6291cb1c67b225 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 16 Jun 2023 00:04:04 +0000 Subject: [PATCH 065/231] ONEMKL: working version of spmv mkl backend This should be good for now but eventually more work is needed for the multivector case which will use the gemm function instead of gemv from onemkl Signed-off-by: Berger-Vergiat --- sparse/src/KokkosSparse_Utils_mkl.hpp | 9 ++ sparse/src/KokkosSparse_spmv.hpp | 10 +- ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 2 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 39 +++++ .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 150 ++++++++++++++++++ sparse/unit_test/Test_Sparse.hpp | 1 - 6 files changed, 207 insertions(+), 4 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils_mkl.hpp b/sparse/src/KokkosSparse_Utils_mkl.hpp index 0afa75de0a..7a8dd0cb22 100644 --- a/sparse/src/KokkosSparse_Utils_mkl.hpp +++ b/sparse/src/KokkosSparse_Utils_mkl.hpp @@ -230,6 +230,15 @@ inline void MKLSparseMatrix>::export_data( } // namespace Impl } // namespace KokkosSparse +// Utilities for oneMKL SYCL code +#ifdef KOKKOS_ENABLE_SYCL +#include "oneapi/mkl/spblas.hpp" + +namespace KokkosSparse { +namespace Impl {} +} // namespace KokkosSparse +#endif // KOKKOS_ENABLE_SYCL + #endif // KOKKOSKERNELS_ENABLE_TPL_MKL 
#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 60fb5331cf..f43ec0bd54 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -171,10 +171,16 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { + if (std::is_same_v) { useFallback = useFallback || (mode[0] == Conjugate[0]); } +#ifdef KOKKOS_ENABLE_SYCL + if (std::is_same_v) { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } +#endif #endif if (useFallback) { diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 1fcfa7132a..b8c545ffe2 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -141,7 +141,7 @@ SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::Serial) #ifdef KOKKOS_ENABLE_OPENMP SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::OpenMP) #endif -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 060fef45bb..a8632263f9 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -242,6 +242,45 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif +#ifdef KOKKOS_ENABLE_SYCL +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + const SCALAR, const ORDINAL, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const ORDINAL, const SCALAR*, \ + Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + Kokkos::LayoutLeft, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + float, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif + #endif // KOKKOSKERNELS_ENABLE_TPL_MKL } // namespace Impl diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index ecbe45c7fd..9b37361e65 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -755,6 +755,156 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #endif #undef KOKKOSSPARSE_SPMV_MKL + +#ifdef KOKKOS_ENABLE_SYCL +inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return oneapi::mkl::transpose::nontrans; + case 'T': return oneapi::mkl::transpose::trans; + case 'H': return oneapi::mkl::transpose::conjtrans; + default:; + } + throw std::invalid_argument( + "Invalid mode for oneMKL (should be one of N, T, H)"); +} + +template +struct spmv_onemkl_wrapper {}; + +template <> +struct spmv_onemkl_wrapper { + template + static void spmv(const execution_space& exec, + oneapi::mkl::transpose const mkl_mode, + 
typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + + oneapi::mkl::sparse::matrix_handle_t handle = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle); + auto ev_set = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), handle, A.numRows(), A.numCols(), + oneapi::mkl::index_base::zero, + const_cast(A.graph.row_map.data()), + const_cast(A.graph.entries.data()), + const_cast(A.values.data())); + auto ev_opt = oneapi::mkl::sparse::optimize_gemv( + exec.sycl_queue(), mkl_mode, handle, {ev_set}); + auto ev_gemv = + oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, + x.data(), beta, y.data(), {ev_opt}); + auto ev_release = oneapi::mkl::sparse::release_matrix_handle( + exec.sycl_queue(), &handle, {ev_gemv}); + ev_release.wait(); + } +}; + +template <> +struct spmv_onemkl_wrapper { + template + static void spmv(const execution_space& exec, + oneapi::mkl::transpose const mkl_mode, + typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + + oneapi::mkl::sparse::matrix_handle_t handle = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle); + auto ev_set = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), handle, static_cast(A.numRows()), + static_cast(A.numCols()), oneapi::mkl::index_base::zero, + const_cast(A.graph.row_map.data()), + const_cast(A.graph.entries.data()), + reinterpret_cast*>( + const_cast(A.values.data()))); + auto ev_opt = 
oneapi::mkl::sparse::optimize_gemv( + exec.sycl_queue(), mkl_mode, handle, {ev_set}); + auto ev_gemv = oneapi::mkl::sparse::gemv( + exec.sycl_queue(), mkl_mode, alpha, handle, + reinterpret_cast*>( + const_cast(x.data())), + beta, reinterpret_cast*>(y.data()), {ev_opt}); + auto ev_release = oneapi::mkl::sparse::release_matrix_handle( + exec.sycl_queue(), &handle, {ev_gemv}); + ev_release.wait(); + } +}; + +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV, \ + Kokkos::MemoryTraits, ORDINAL const, \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true, \ + COMPILE_LIBRARY> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using device_type = Kokkos::Device; \ + using AMatrix = \ + CrsMatrix, ORDINAL const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv(const Controls&, const char mode[], \ + const coefficient_type& alpha, const AMatrix& A, \ + const XVector& x, const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ + execution_space exec{}; \ + spmv_onemkl_wrapper::is_complex>::spmv( \ + exec, mkl_mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(double, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) 
+KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(double, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif } // namespace Impl } // namespace KokkosSparse #endif diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index e0d0085be1..0dbf7bc759 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -52,7 +52,6 @@ // to actually define tests. #include "Test_Sparse_Utils_cusparse.hpp" - #include "Test_Sparse_rocsparse.hpp" #endif // TEST_SPARSE_HPP From c11cd3319e97de61cefdad8062f28011719ef81a Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 26 Jun 2023 17:35:32 -0600 Subject: [PATCH 066/231] Add missing KokkosKernels_Macros.hpp include (#1884) in batched dense --- batched/KokkosBatched_Util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 27fb0bf338..614a98dedb 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -35,6 +35,7 @@ #include "Kokkos_Complex.hpp" #include "KokkosKernels_config.h" +#include "KokkosKernels_Macros.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosBlas_util.hpp" From 118c133b6fdaf86742338eaf62166f6f5f48da00 Mon Sep 17 00:00:00 2001 From: "Roscoe A. 
Bartlett" Date: Wed, 28 Jun 2023 14:47:26 -0600 Subject: [PATCH 067/231] Remove calling tribits_exclude_autotools_files() KokkosKernels is not even calling the wrapper macro kokkoskernels_exclude_autotools_files() which calls tribits_exclude_autotools_files(). KokkosKernels has no autotools files anymore and support for tribits_exclude_autotools_files() was stripped out of TriBITS because it is not needed anymore. There will be a slight hack in Trilinos 'develop' to avoid needing to make this change on Trilinos 'develop'. But the next time KokkosKernels is snapshotted into Trilinos, we can remove that (minor) hack. --- cmake/fake_tribits.cmake | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 4a44ffea86..52e9c00b72 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -250,14 +250,6 @@ FUNCTION(KOKKOSKERNELS_ADD_ADVANCED_TEST) ENDIF() ENDFUNCTION() -MACRO(KOKKOSKERNELS_EXCLUDE_AUTOTOOLS_FILES) - IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_EXCLUDE_AUTOTOOLS_FILES() - ELSE() - #DO nothing - ENDIF() -ENDMACRO(KOKKOSKERNELS_EXCLUDE_AUTOTOOLS_FILES) - FUNCTION(KOKKOSKERNELS_LIB_TYPE LIB RET) GET_TARGET_PROPERTY(PROP ${LIB} TYPE) IF (${PROP} STREQUAL "INTERFACE_LIBRARY") From 2d2daa5bcaa708ebcd0b15de13443fbfd5aeb808 Mon Sep 17 00:00:00 2001 From: hartsw <103582976+hartsw@users.noreply.github.com> Date: Wed, 28 Jun 2023 22:29:53 -0400 Subject: [PATCH 068/231] Change 'or' to '||' to fix compilation on MSVC (#1885) Co-authored-by: Shane W. D. 
Hart --- sparse/src/KokkosSparse_spgemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_spgemm.hpp b/sparse/src/KokkosSparse_spgemm.hpp index 882dfd5ec2..b2737a9e2c 100644 --- a/sparse/src/KokkosSparse_spgemm.hpp +++ b/sparse/src/KokkosSparse_spgemm.hpp @@ -167,7 +167,7 @@ template void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { auto blockDim = A.blockDim(); - if (blockDim != B.blockDim() or blockDim != C.blockDim()) { + if (blockDim != B.blockDim() || blockDim != C.blockDim()) { throw std::invalid_argument( "Block SpGEMM must be called for matrices with the same block size"); } From 9b5a1a2c068f66a6c73adcbddfa505b0c809df83 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 28 Jun 2023 20:31:32 -0600 Subject: [PATCH 069/231] cusparse 12 spmv: check y vector alignment (#1889) cusparse 12 requires that the y (output) vector has 16-byte alignment. So check for this and call the native fallback if y has less alignment. This fixes Trilinos #11926. 
--- sparse/src/KokkosSparse_spmv.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index f43ec0bd54..c663de2860 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -161,6 +161,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], useFallback = useFallback || (mode[0] == Conjugate[0]); #endif } + // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for all + // scalar types +#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) + uintptr_t yptr = uintptr_t((void*)y.data()); + if (yptr % 16 != 0) useFallback = true; +#endif #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE From 19b30d858afba1b6a5af3e905af6ba02a1d77e85 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 3 Jul 2023 11:22:00 -0600 Subject: [PATCH 070/231] Remove non-existent dir from CMake include paths (#1892) ode/ in the binary dir doesn't exist as a header directory (because the ODE component has no auto-generated headers for ETI). The unit tests might be there, but contain no headers that need to be found. This fixes errors when building against a Trilinos installation, where CMake looks for this directory that doesn't exist. --- ode/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ode/CMakeLists.txt b/ode/CMakeLists.txt index 9d92dc07ba..b9cf089445 100644 --- a/ode/CMakeLists.txt +++ b/ode/CMakeLists.txt @@ -11,5 +11,8 @@ ENDIF() # Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) +# Note BMK: Since ODE has no auto-generated ETI files, this directory does not exist in a build without unit tests. +# This causes configure errors when building an app against a Trilinos install, and the unit test build dir doesn't contain any headers that need to be found. +# But uncomment the next line if ETI headers are ever added.
+# KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode) From 54d372005cf8d30328f6383f2de8b5a827702b82 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 10:14:32 -0600 Subject: [PATCH 071/231] mdf: move most expensive kernels over to hierarchical parallism --- sparse/impl/KokkosSparse_mdf_impl.hpp | 1348 +++++++++++++++++++------ sparse/src/KokkosSparse_mdf.hpp | 91 +- sparse/unit_test/Test_Sparse_mdf.hpp | 187 +++- 3 files changed, 1267 insertions(+), 359 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index d8754e591c..51f3ae98c3 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -18,6 +18,8 @@ #define KOKKOSSPARSE_MDF_IMPL_HPP_ #include +#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_findRelOffset.hpp" #include #include "Kokkos_ArithTraits.hpp" @@ -63,7 +65,7 @@ struct MDF_count_lower { }; // MDF_count_lower -template +template struct MDF_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = @@ -80,6 +82,7 @@ struct MDF_discarded_fill_norm { crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; + col_ind_type update_list; values_mag_type discarded_fill; col_ind_type deficiency; @@ -89,107 +92,289 @@ struct MDF_discarded_fill_norm { ordinal_type factorization_step_, col_ind_type permutation_, values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) + col_ind_type deficiency_, int verbosity_, + col_ind_type update_list_ = col_ind_type{}) : A(A_), At(At_), factorization_step(factorization_step_), permutation(permutation_), + update_list(update_list_), discarded_fill(discarded_fill_), deficiency(deficiency_), verbosity(verbosity_){}; - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - 
ordinal_type rowIdx = permutation(i); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + struct DiscNormReducer { + using reducer = DiscNormReducer; + struct value_type { + scalar_mag_type discarded_norm; + ordinal_type numFillEntries; + scalar_type diag_val; + }; + using result_view_type = Kokkos::View; + + private: + result_view_type value; + + public: + KOKKOS_INLINE_FUNCTION + DiscNormReducer(value_type& value_) : value(&value_) {} + + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.discarded_norm += src.discarded_norm; + dest.numFillEntries += src.numFillEntries; + if (dest.diag_val == KAS::zero()) dest.diag_val = src.diag_val; + } - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.discarded_norm = Kokkos::reduction_identity::sum(); + val.numFillEntries = Kokkos::reduction_identity::sum(); + val.diag_val = KAS::zero(); + } - if (fillColIdx != rowIdx && col_not_eliminated) { - 
entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - int(At.graph.entries(alphaIdx)), - int(A.graph.entries(betaIdx)), - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), - int(rowIdx)); - } - } - } + KOKKOS_INLINE_FUNCTION + static value_type init() { + value_type out; + init(out); + return out; + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + }; + + KOKKOS_INLINE_FUNCTION + void operator()(team_member_t team) const { + const ordinal_type rowIdx = + is_initial_fill ? 
permutation(team.league_rank()) + : permutation(update_list(team.league_rank())); + const auto colView = At.rowConst(rowIdx); + const auto rowView = A.rowConst(rowIdx); + + using reduction_val_t = typename DiscNormReducer::value_type; + reduction_val_t reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, reduction_val_t& running_disc_norm) { + const ordinal_type fillRowIdx = colView.colidx(alpha); + + // Record diagonal term + if (fillRowIdx == rowIdx) { + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.diag_val = colView.value(alpha); + }); + return; } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, values(%d)=%f\n", int(rowIdx), - int(alphaIdx), At.values(alphaIdx)); - } else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, |values(%d)|=%f\n", - int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); + + // Check if row already eliminated + if constexpr (!is_initial_fill) { + bool row_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, factorization_step), + [&](const ordinal_type stepIdx, bool& running_row_eliminated) { + running_row_eliminated |= fillRowIdx == permutation(stepIdx); + }, + Kokkos::LOr(row_eliminated)); + + if (row_eliminated) return; } - } - } - } - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; + const auto fillRowView = A.rowConst(fillRowIdx); + reduction_val_t local_reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, rowView.length), + [&](const ordinal_type beta, + reduction_val_t& vect_running_disc_norm) 
{ + const ordinal_type fillColIdx = rowView.colidx(beta); + + if (fillColIdx == rowIdx) return; + + if constexpr (!is_initial_fill) { + bool col_eliminated = false; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; + ++stepIdx) { + col_eliminated |= fillColIdx == permutation(stepIdx); + } - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } - } - } + if (col_eliminated) return; + } + bool entryIsDiscarded = true; + for (ordinal_type gamma = 0; gamma < fillRowView.length; + ++gamma) { + if (fillRowView.colidx(gamma) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + vect_running_disc_norm.numFillEntries += 1; + vect_running_disc_norm.discarded_norm += + KAS::abs(colView.value(alpha) * rowView.value(beta)) * + KAS::abs(colView.value(alpha) * rowView.value(beta)); + } + }, + DiscNormReducer(local_reduction_val)); + + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.discarded_norm += + local_reduction_val.discarded_norm; + running_disc_norm.numFillEntries += + local_reduction_val.numFillEntries; + }); + }, + DiscNormReducer(reduction_val)); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + const scalar_mag_type& discard_norm = reduction_val.discarded_norm; + const ordinal_type& numFillEntries = reduction_val.numFillEntries; + const scalar_type& diag_val = reduction_val.diag_val; + + // TODO add a check on `diag_val == zero` + discarded_fill(rowIdx) = discard_norm / KAS::abs(diag_val * diag_val); + deficiency(rowIdx) = numFillEntries; + }); + } }; // MDF_discarded_fill_norm +// template +// struct MDF_discarded_fill_norm_old { +// using static_crs_graph_type = typename 
crs_matrix_type::StaticCrsGraphType; +// using col_ind_type = +// typename static_crs_graph_type::entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using values_mag_type = +// typename MDF_types::values_mag_type; using size_type +// = typename crs_matrix_type::size_type; using ordinal_type = typename +// crs_matrix_type::ordinal_type; using scalar_type = typename +// crs_matrix_type::value_type; using KAS = typename +// Kokkos::ArithTraits; using scalar_mag_type = typename +// KAS::mag_type; using KAM = typename +// Kokkos::ArithTraits; + +// crs_matrix_type A, At; +// ordinal_type factorization_step; +// col_ind_type permutation; + +// values_mag_type discarded_fill; +// col_ind_type deficiency; +// int verbosity; + +// MDF_discarded_fill_norm_old(crs_matrix_type A_, crs_matrix_type At_, +// ordinal_type factorization_step_, +// col_ind_type permutation_, +// values_mag_type discarded_fill_, +// col_ind_type deficiency_, int verbosity_) +// : A(A_), +// At(At_), +// factorization_step(factorization_step_), +// permutation(permutation_), +// discarded_fill(discarded_fill_), +// deficiency(deficiency_), +// verbosity(verbosity_){}; + +// KOKKOS_INLINE_FUNCTION +// void operator()(const ordinal_type i) const { +// ordinal_type rowIdx = permutation(i); +// scalar_mag_type discard_norm = KAM::zero(); +// scalar_type diag_val = KAS::zero(); +// bool entryIsDiscarded = true; +// ordinal_type numFillEntries = 0; +// for (size_type alphaIdx = At.graph.row_map(rowIdx); +// alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { +// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); +// bool row_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) +// { +// if (fillRowIdx == permutation(stepIdx)) { +// row_not_eliminated = false; +// } +// } + +// if (fillRowIdx != rowIdx && row_not_eliminated) { +// for (size_type betaIdx = A.graph.row_map(rowIdx); +// betaIdx < 
A.graph.row_map(rowIdx + 1); ++betaIdx) { +// ordinal_type fillColIdx = A.graph.entries(betaIdx); +// bool col_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; +// ++stepIdx) { +// if (fillColIdx == permutation(stepIdx)) { +// col_not_eliminated = false; +// } +// } + +// if (fillColIdx != rowIdx && col_not_eliminated) { +// entryIsDiscarded = true; +// for (size_type entryIdx = A.graph.row_map(fillRowIdx); +// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { +// if (A.graph.entries(entryIdx) == fillColIdx) { +// entryIsDiscarded = false; +// } +// } +// if (entryIsDiscarded) { +// numFillEntries += 1; +// discard_norm += +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); +// if (verbosity > 1) { +// if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Adding value A[%d,%d]=%f to discard norm of row %d\n", +// int(At.graph.entries(alphaIdx)), +// int(A.graph.entries(betaIdx)), +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), +// int(rowIdx)); +// } +// } +// } +// } +// } +// } else if (fillRowIdx == rowIdx) { +// diag_val = At.values(alphaIdx); +// if (verbosity > 1) { +// if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d diagonal value detected, values(%d)=%f\n", +// int(rowIdx), int(alphaIdx), At.values(alphaIdx)); +// } else if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d diagonal value detected, |values(%d)|=%f\n", +// int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); +// } +// } +// } +// } + +// // TODO add a check on `diag_val == zero` +// discard_norm = discard_norm / KAS::abs(diag_val * diag_val); +// discarded_fill(rowIdx) = discard_norm; +// deficiency(rowIdx) = numFillEntries; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// const ordinal_type degree = 
ordinal_type(A.graph.row_map(rowIdx + 1) +// - +// A.graph.row_map(rowIdx) - +// 1); +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d has discarded fill of %f, deficiency of %d and degree +// %d\n", static_cast(rowIdx), +// static_cast(KAM::sqrt(discard_norm)), +// static_cast(deficiency(rowIdx)), static_cast(degree)); +// } +// } +// } + +// }; // MDF_discarded_fill_norm_old + template struct MDF_selective_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; @@ -435,6 +620,14 @@ struct MDF_select_row { }; // MDF_select_row +template +KOKKOS_INLINE_FUNCTION bool sorted_view_contains( + const view_type& values, const ordinal_type size, + typename view_type::const_value_type search_val) { + return KokkosSparse::findRelOffset(values, size, search_val, size, true) != + size; +} + template struct MDF_factorize_row { using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -463,8 +656,14 @@ struct MDF_factorize_row { col_ind_type factored; ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; + int verbosity; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + MDF_factorize_row(crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, @@ -472,7 +671,8 @@ struct MDF_factorize_row { col_ind_type permutation_, col_ind_type permutation_inv_, values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, - ordinal_type factorization_step_, int verbosity_) + ordinal_type factorization_step_, + col_ind_type& update_list_, int verbosity_) : A(A_), At(At_), row_mapL(row_mapL_), @@ -487,276 +687,782 @@ struct MDF_factorize_row { factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), + update_list(update_list_), 
verbosity(verbosity_){}; + // Phase 2, do facrotization KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type /* idx */) const { - const ordinal_type selected_row = permutation(selected_row_idx); - discarded_fill(selected_row) = Kokkos::ArithTraits::max(); - - // Swap entries in permutation vectors - permutation(selected_row_idx) = permutation(factorization_step); - permutation(factorization_step) = selected_row; - permutation_inv(permutation(factorization_step)) = factorization_step; - permutation_inv(permutation(selected_row_idx)) = selected_row_idx; - - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); - for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(permutation(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - - // Insert the upper part of the selected row in U - // including the diagonal term. - value_type diag = Kokkos::ArithTraits::zero(); - size_type U_entryIdx = row_mapU(factorization_step); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { - entriesU(U_entryIdx) = A.graph.entries(entryIdx); - valuesU(U_entryIdx) = A.values(entryIdx); - ++U_entryIdx; - if (A.graph.entries(entryIdx) == selected_row) { - diag = A.values(entryIdx); - } - } - } - row_mapU(factorization_step + 1) = U_entryIdx; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - static_cast(selected_row), - static_cast(diag)); - } - - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); - for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; - ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(row_mapU(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx 
< row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // Insert the lower part of the selected column of A - // divided by its the diagonal value to obtain a unit - // diagonal value in L. - size_type L_entryIdx = row_mapL(factorization_step); - entriesL(L_entryIdx) = selected_row; - valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); - ++L_entryIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { - entriesL(L_entryIdx) = At.graph.entries(entryIdx); - valuesL(L_entryIdx) = At.values(entryIdx) / diag; - ++L_entryIdx; - } + void operator()(team_member_t team) const { + const auto alpha = team.league_rank(); + const ordinal_type selected_row = permutation(factorization_step); + const auto colView = At.rowConst(selected_row); + + const auto rowInd = colView.colidx(alpha); + if (rowInd == selected_row) return; + + { + bool row_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, factorization_step), + [&](const ordinal_type step, bool& partial) { + partial |= rowInd == permutation(step); + }, + Kokkos::LOr(row_eliminated)); + + if (row_eliminated) return; } - row_mapL(factorization_step + 1) = L_entryIdx; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - static_cast(factorization_step), - static_cast(factorization_step), - static_cast(factorization_step + 1), - static_cast(row_mapL(factorization_step)), - 
static_cast(row_mapL(factorization_step + 1))); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // If this was the last row no need to update A and At! - if (factorization_step == A.numRows() - 1) { - return; - } - - // Finally we want to update A and At with the values - // that where not discarded during factorization. - // Note: this is almost the same operation as computing - // the norm of the discarded fill... - - // First step: find the diagonal entry in selected_row - value_type diag_val = Kokkos::ArithTraits::zero(); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - ordinal_type colIdx = A.graph.entries(entryIdx); - if (selected_row == colIdx) { - diag_val = A.values(entryIdx); - } - } + // Only one of the values will match selected so can just sum all contribs + const auto rowView = A.rowConst(selected_row); + value_type diag = Kokkos::ArithTraits::zero(); + Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, rowView.length), + [&](const size_type ind, value_type& running_diag) { + if (rowView.colidx(ind) == selected_row) + running_diag = rowView.value(ind); + }, + Kokkos::Sum(diag)); // Extract alpha and beta vectors // Then insert alpha*beta/diag_val if the corresponding // entry in A is non-zero. 
- for (size_type alphaIdx = At.graph.row_map(selected_row); - alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if ((fillRowIdx != selected_row) && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(selected_row); - betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } - - if ((fillColIdx != selected_row) && col_not_eliminated) { - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - A.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", static_cast(fillRowIdx), - static_cast(fillColIdx), - static_cast(At.values(alphaIdx) * - A.values(betaIdx) / diag_val)); - } - } - } - } - - for (size_type entryIdx = At.graph.row_map(fillColIdx); - entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { - if (At.graph.entries(entryIdx) == fillRowIdx) { - At.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - } - } + auto fillRowView = A.row(rowInd); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const ordinal_type beta) { + const auto colInd = rowView.colidx(beta); + + if (colInd == selected_row) return; + + { + bool col_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, factorization_step), + [&](const ordinal_type step, 
bool& partial) { + partial |= colInd == permutation(step); + }, + Kokkos::LOr(col_eliminated)); + + if (col_eliminated) return; } - } - } - } - factored(selected_row) = 1; + const auto subVal = colView.value(alpha) * rowView.value(beta) / diag; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); - for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(A.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); - for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(At.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - } // operator() + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillRowView.length), + [&](const ordinal_type gamma) { + if (colInd == fillRowView.colidx(gamma)) { + Kokkos::atomic_sub(&fillRowView.value(gamma), subVal); + } + }); + + auto fillColView = At.row(colInd); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillColView.length), + [&](const ordinal_type delt) { + if (rowInd == fillColView.colidx(delt)) { + Kokkos::atomic_sub(&fillColView.value(delt), subVal); + } + }); + }); + } +}; -}; // MDF_factorize_row +// template +// struct MDF_factorize_row_heir_old { +// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: +// row_map_type::non_const_type; +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using ordinal_type = +// typename crs_matrix_type::ordinal_type; using size_type = typename +// crs_matrix_type::size_type; using value_type = typename +// crs_matrix_type::value_type; using values_mag_type = typename +// MDF_types::values_mag_type; using value_mag_type = +// typename 
values_mag_type::value_type; + +// crs_matrix_type A, At; + +// row_map_type row_mapL; +// col_ind_type entriesL; +// values_type valuesL; + +// row_map_type row_mapU; +// col_ind_type entriesU; +// values_type valuesU; + +// col_ind_type permutation, permutation_inv; +// values_mag_type discarded_fill; +// col_ind_type factored; +// ordinal_type selected_row_idx, factorization_step; + +// col_ind_type update_list; + +// int verbosity; + +// using execution_space = typename crs_matrix_type::execution_space; +// using team_policy_t = Kokkos::TeamPolicy; +// using team_member_t = typename team_policy_t::member_type; + +// MDF_factorize_row_heir_old(crs_matrix_type A_, crs_matrix_type At_, +// row_map_type row_mapL_, col_ind_type entriesL_, +// values_type valuesL_, row_map_type row_mapU_, +// col_ind_type entriesU_, values_type valuesU_, +// col_ind_type permutation_, col_ind_type permutation_inv_, +// values_mag_type discarded_fill_, col_ind_type factored_, +// ordinal_type selected_row_idx_, +// ordinal_type factorization_step_, col_ind_type& +// update_list_, int verbosity_) +// : A(A_), +// At(At_), +// row_mapL(row_mapL_), +// entriesL(entriesL_), +// valuesL(valuesL_), +// row_mapU(row_mapU_), +// entriesU(entriesU_), +// valuesU(valuesU_), +// permutation(permutation_), +// permutation_inv(permutation_inv_), +// discarded_fill(discarded_fill_), +// factored(factored_), +// selected_row_idx(selected_row_idx_), +// factorization_step(factorization_step_), +// update_list(update_list_), +// verbosity(verbosity_){}; + +// //Phase 2, do facrotization +// KOKKOS_INLINE_FUNCTION +// void operator()(team_member_t team) const{ +// const ordinal_type selected_row = permutation(factorization_step); +// const auto rowView = A.rowConst(selected_row); +// const auto colView = At.rowConst(selected_row); + +// // If this was the last row no need to update A and At! 
+// if (factorization_step == A.numRows() - 1) { +// return; +// } + +// // Only one of the values will match selected so can just sum all +// contribs value_type diag = Kokkos::ArithTraits::zero(); +// Kokkos::parallel_reduce( +// Kokkos::TeamVectorRange(team,rowView.length), +// [&](const size_type alpha,value_type & running_diag){ +// if (rowView.colidx(alpha) == selected_row) +// running_diag = rowView.value(alpha); +// }, +// Kokkos::Sum(diag) +// ); + +// // Extract alpha and beta vectors +// // Then insert alpha*beta/diag_val if the corresponding +// // entry in A is non-zero. +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team,colView.length), +// [&](const ordinal_type alpha){ +// const auto rowInd = colView.colidx(alpha); +// auto fillRowView = A.row(rowInd); + +// if (rowInd == selected_row) return; + +// bool row_eliminated = false; +// Kokkos::parallel_reduce( +// Kokkos::ThreadVectorRange(team,factorization_step), +// [&](const ordinal_type step, bool & partial){ +// partial |= rowInd == permutation(step); +// }, +// Kokkos::LOr(row_eliminated) +// ); + +// if (row_eliminated) return; + +// Kokkos::parallel_for( +// Kokkos::ThreadVectorRange(team,rowView.length), +// [&](const ordinal_type beta){ +// const auto colInd = rowView.colidx(beta); + +// if (colInd == selected_row) return; + +// bool col_eliminated = false; +// for (ordinal_type step = 0; step < factorization_step; ++step){ +// col_eliminated |= colInd == permutation(step); +// } + +// if (col_eliminated) return; + +// const auto subVal = colView.colidx(alpha) * rowView.colidx(beta) +// / diag; for (ordinal_type gamma = 0; gamma < fillRowView.length; +// ++gamma){ +// if (colInd == fillRowView.colidx(gamma)){ +// Kokkos::atomic_sub( +// &fillRowView.value(gamma), +// subVal +// ); +// } +// } +// auto fillColView = At.row(colInd); +// for (ordinal_type delt = 0; delt < fillColView.length; ++delt){ +// if (rowInd == fillColView.colidx(delt)){ +// Kokkos::atomic_sub( +// 
&fillColView.value(delt), +// subVal +// ); +// } +// } +// }); +// } +// ); +// } +// }; template struct MDF_compute_list_length { + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: + row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using size_type = typename crs_matrix_type::size_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + using value_type = typename crs_matrix_type::value_type; + using values_mag_type = typename MDF_types::values_mag_type; + using value_mag_type = typename values_mag_type::value_type; - ordinal_type selected_row_idx; - crs_matrix_type A; - crs_matrix_type At; - col_ind_type permutation; + crs_matrix_type A, At; + + row_map_type row_mapL; + col_ind_type entriesL; + values_type valuesL; + + row_map_type row_mapU; + col_ind_type entriesU; + values_type valuesU; + + col_ind_type permutation, permutation_inv; + values_mag_type discarded_fill; col_ind_type factored; - col_ind_type update_list_length; + ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; - MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, - const crs_matrix_type& At_, - const col_ind_type& permutation_, - const col_ind_type factored_, - col_ind_type& update_list_length_, - col_ind_type& update_list_) - : selected_row_idx(rowIdx_), - A(A_), + int verbosity; + + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + MDF_compute_list_length( + crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, + col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, + col_ind_type 
entriesU_, values_type valuesU_, col_ind_type permutation_, + col_ind_type permutation_inv_, values_mag_type discarded_fill_, + col_ind_type factored_, ordinal_type selected_row_idx_, + ordinal_type factorization_step_, col_ind_type& update_list_, + int verbosity_) + : A(A_), At(At_), + row_mapL(row_mapL_), + entriesL(entriesL_), + valuesL(valuesL_), + row_mapU(row_mapU_), + entriesU(entriesU_), + valuesU(valuesU_), permutation(permutation_), + permutation_inv(permutation_inv_), + discarded_fill(discarded_fill_), factored(factored_), - update_list_length(update_list_length_), - update_list(update_list_) {} + selected_row_idx(selected_row_idx_), + factorization_step(factorization_step_), + update_list(update_list_), + verbosity(verbosity_){}; + // Phase 1, update list length KOKKOS_INLINE_FUNCTION - void operator()(const size_type /*idx*/) const { + void operator()(const team_member_t team, ordinal_type& update_list_len, + ordinal_type& selected_row_len) const { const ordinal_type selected_row = permutation(selected_row_idx); - size_type updateIdx = 0; - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if ((A.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - update_list(updateIdx) = A.graph.entries(entryIdx); - ++updateIdx; - } + const auto rowView = A.rowConst(selected_row); + const auto colView = At.rowConst(selected_row); + + size_type U_entryIdx = row_mapU(factorization_step); + size_type L_entryIdx = row_mapL(factorization_step); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); + + // Swap entries in permutation vectors + permutation(selected_row_idx) = permutation(factorization_step); + permutation(factorization_step) = selected_row; + permutation_inv(permutation(factorization_step)) = factorization_step; + permutation_inv(permutation(selected_row_idx)) = selected_row_idx; + + // 
Diagonal value of L + entriesL(L_entryIdx) = selected_row; + valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); + }); + ++L_entryIdx; + + // Insert the upper part of the selected row in U + // including the diagonal term. + ordinal_type updateIdx = 0; + value_type diag = Kokkos::ArithTraits::zero(); + { + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto colInd = rowView.colidx(alpha); + if ((colInd != selected_row) && (factored(colInd) != 1)) { + if (is_final) { + update_list(running_update) = colInd; + ++updateIdx; + } + ++running_update; + } + } + // ,updateIdx + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(updateIdx)); + + // Sort update list + KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); } - size_type update_rows = updateIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if ((At.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - bool already_updated = false; - for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { - if (At.graph.entries(entryIdx) == update_list(checkIdx)) { - already_updated = true; - break; + { + size_type numEntrU = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto colInd = rowView.colidx(alpha); + if (permutation_inv(colInd) >= factorization_step) { + if (is_final) { + ++numEntrU; + entriesU(U_entryIdx + running_nEntr) = colInd; + valuesU(U_entryIdx + running_nEntr) = rowView.value(alpha); + if (colInd == selected_row) diag = rowView.value(alpha); + } + ++running_nEntr; + } } - } - if (already_updated == false) { - update_list(updateIdx) = At.graph.entries(entryIdx); 
- ++updateIdx; - } - } + // , numEntrU + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrU)); + + U_entryIdx += numEntrU; } - update_list_length(0) = updateIdx; + + // Only one thread found diagonal so just sum over all + team.team_reduce(Kokkos::Sum(diag)); + + // Insert the lower part of the selected column of A + // divided by its the diagonal value to obtain a unit + // diagonal value in L. + { + size_type numEntrL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto rowInd = colView.colidx(alpha); + if (permutation_inv(rowInd) > factorization_step) { + if (is_final) { + ++numEntrL; + entriesL(L_entryIdx + running_nEntr) = rowInd; + valuesL(L_entryIdx + running_nEntr) = + colView.value(alpha) / diag; + } + ++running_nEntr; + } + } + // , numEntrL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrL)); + + L_entryIdx += numEntrL; + } + { + ordinal_type numUpdateL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto rowInd = colView.colidx(alpha); + if ((rowInd != selected_row) && (factored(rowInd) != 1)) { + // updateIdx currently holds the rows that were updated. 
don't add + // duplicates + const size_type& update_rows = updateIdx; + + const bool already_updated = + sorted_view_contains(update_list, update_rows, rowInd); + + if (!already_updated) { + // Cannot make use of vector ranges until + // https://github.com/kokkos/kokkos/issues/6259 is resolved + // Kokkos::single(Kokkos::PerThread(team),[&]{ + if (is_final) { + update_list(updateIdx + running_update) = rowInd; + ++numUpdateL; + } + ++running_update; + // }); + } + } + } + // , numUpdateL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numUpdateL)); + + updateIdx += numUpdateL; + } + + Kokkos::single(Kokkos::PerTeam(team), [&] { + row_mapU(factorization_step + 1) = U_entryIdx; + row_mapL(factorization_step + 1) = L_entryIdx; + + update_list_len = updateIdx; + selected_row_len = rowView.length; + + factored(selected_row) = 1; + }); } }; +// template +// struct MDF_factorize_row_old { +// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: +// row_map_type::non_const_type; +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using ordinal_type = +// typename crs_matrix_type::ordinal_type; using size_type = typename +// crs_matrix_type::size_type; using value_type = typename +// crs_matrix_type::value_type; using values_mag_type = typename +// MDF_types::values_mag_type; using value_mag_type = +// typename values_mag_type::value_type; + +// crs_matrix_type A, At; + +// row_map_type row_mapL; +// col_ind_type entriesL; +// values_type valuesL; + +// row_map_type row_mapU; +// col_ind_type entriesU; +// values_type valuesU; + +// col_ind_type permutation, permutation_inv; +// values_mag_type discarded_fill; +// col_ind_type factored; +// ordinal_type selected_row_idx, factorization_step; + +// int verbosity; + +// 
MDF_factorize_row_old(crs_matrix_type A_, crs_matrix_type At_, +// row_map_type row_mapL_, col_ind_type entriesL_, +// values_type valuesL_, row_map_type row_mapU_, +// col_ind_type entriesU_, values_type valuesU_, +// col_ind_type permutation_, col_ind_type permutation_inv_, +// values_mag_type discarded_fill_, col_ind_type factored_, +// ordinal_type selected_row_idx_, +// ordinal_type factorization_step_, int verbosity_) +// : A(A_), +// At(At_), +// row_mapL(row_mapL_), +// entriesL(entriesL_), +// valuesL(valuesL_), +// row_mapU(row_mapU_), +// entriesU(entriesU_), +// valuesU(valuesU_), +// permutation(permutation_), +// permutation_inv(permutation_inv_), +// discarded_fill(discarded_fill_), +// factored(factored_), +// selected_row_idx(selected_row_idx_), +// factorization_step(factorization_step_), +// verbosity(verbosity_){}; + +// KOKKOS_INLINE_FUNCTION +// void operator()(const ordinal_type /* idx */) const { +// const ordinal_type selected_row = permutation(selected_row_idx); +// discarded_fill(selected_row) = +// Kokkos::ArithTraits::max(); + +// // Swap entries in permutation vectors +// permutation(selected_row_idx) = permutation(factorization_step); +// permutation(factorization_step) = selected_row; +// permutation_inv(permutation(factorization_step)) = factorization_step; +// permutation_inv(permutation(selected_row_idx)) = selected_row_idx; + +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); +// for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(permutation(rowIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } + +// // Insert the upper part of the selected row in U +// // including the diagonal term. 
+// value_type diag = Kokkos::ArithTraits::zero(); +// size_type U_entryIdx = row_mapU(factorization_step); +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { +// entriesU(U_entryIdx) = A.graph.entries(entryIdx); +// valuesU(U_entryIdx) = A.values(entryIdx); +// ++U_entryIdx; +// if (A.graph.entries(entryIdx) == selected_row) { +// diag = A.values(entryIdx); +// } +// } +// } +// row_mapU(factorization_step + 1) = U_entryIdx; +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", +// static_cast(selected_row), +// static_cast(diag)); +// } + +// if (verbosity > 2) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); +// for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; +// ++rowIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(row_mapU(rowIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); +// for (size_type entryIdx = row_mapU(0); +// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(entriesU(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); +// for (size_type entryIdx = row_mapU(0); +// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", +// static_cast(valuesU(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } + +// // Insert the lower part of the selected column of A +// // divided by its the diagonal value to obtain a unit +// // diagonal value in L. 
+// size_type L_entryIdx = row_mapL(factorization_step); +// entriesL(L_entryIdx) = selected_row; +// valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); +// ++L_entryIdx; +// for (size_type entryIdx = At.graph.row_map(selected_row); +// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { +// if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { +// entriesL(L_entryIdx) = At.graph.entries(entryIdx); +// valuesL(L_entryIdx) = At.values(entryIdx) / diag; +// ++L_entryIdx; +// } +// } +// row_mapL(factorization_step + 1) = L_entryIdx; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 2) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", +// static_cast(factorization_step), +// static_cast(factorization_step), +// static_cast(factorization_step + 1), +// static_cast(row_mapL(factorization_step)), +// static_cast(row_mapL(factorization_step + 1))); +// for (size_type entryIdx = row_mapL(factorization_step); +// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(entriesL(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); +// for (size_type entryIdx = row_mapL(factorization_step); +// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", +// static_cast(valuesL(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } + +// // If this was the last row no need to update A and At! +// if (factorization_step == A.numRows() - 1) { +// return; +// } + +// // Finally we want to update A and At with the values +// // that where not discarded during factorization. +// // Note: this is almost the same operation as computing +// // the norm of the discarded fill... 
+ +// // First step: find the diagonal entry in selected_row +// value_type diag_val = Kokkos::ArithTraits::zero(); +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// ordinal_type colIdx = A.graph.entries(entryIdx); +// if (selected_row == colIdx) { +// diag_val = A.values(entryIdx); +// } +// } + +// // Extract alpha and beta vectors +// // Then insert alpha*beta/diag_val if the corresponding +// // entry in A is non-zero. +// for (size_type alphaIdx = At.graph.row_map(selected_row); +// alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { +// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); +// bool row_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) +// { +// if (fillRowIdx == permutation(stepIdx)) { +// row_not_eliminated = false; +// } +// } + +// if ((fillRowIdx != selected_row) && row_not_eliminated) { +// for (size_type betaIdx = A.graph.row_map(selected_row); +// betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { +// ordinal_type fillColIdx = A.graph.entries(betaIdx); +// bool col_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; +// ++stepIdx) { +// if (fillColIdx == permutation(stepIdx)) { +// col_not_eliminated = false; +// } +// } + +// if ((fillColIdx != selected_row) && col_not_eliminated) { +// for (size_type entryIdx = A.graph.row_map(fillRowIdx); +// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { +// if (A.graph.entries(entryIdx) == fillColIdx) { +// A.values(entryIdx) -= +// At.values(alphaIdx) * A.values(betaIdx) / diag_val; +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 1) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "A[%d, %d] -= %f\n", static_cast(fillRowIdx), +// static_cast(fillColIdx), +// static_cast(At.values(alphaIdx) * +// A.values(betaIdx) / diag_val)); +// } +// } +// } +// } + +// for (size_type entryIdx = 
At.graph.row_map(fillColIdx); +// entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { +// if (At.graph.entries(entryIdx) == fillRowIdx) { +// At.values(entryIdx) -= +// At.values(alphaIdx) * A.values(betaIdx) / diag_val; +// } +// } +// } +// } +// } +// } + +// factored(selected_row) = 1; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); +// for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "%f ", static_cast(A.values(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); +// for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "%f ", static_cast(At.values(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } +// } // operator() + +// }; // MDF_factorize_row_old + +// template +// struct MDF_compute_list_length_old { +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using ordinal_type = typename crs_matrix_type::ordinal_type; +// using size_type = typename crs_matrix_type::size_type; + +// ordinal_type selected_row_idx; +// crs_matrix_type A; +// crs_matrix_type At; +// col_ind_type permutation; +// col_ind_type factored; +// col_ind_type update_list_length; +// col_ind_type update_list; + +// MDF_compute_list_length_old(const ordinal_type rowIdx_, const +// crs_matrix_type& A_, +// const crs_matrix_type& At_, +// const col_ind_type& permutation_, +// const col_ind_type factored_, +// col_ind_type& update_list_length_, +// col_ind_type& update_list_) +// : selected_row_idx(rowIdx_), +// A(A_), +// At(At_), +// permutation(permutation_), +// factored(factored_), +// update_list_length(update_list_length_), +// update_list(update_list_) {} + +// KOKKOS_INLINE_FUNCTION +// void operator()(const size_type /*idx*/) const { +// const ordinal_type 
selected_row = permutation(selected_row_idx); + +// size_type updateIdx = 0; +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// if ((A.graph.entries(entryIdx) != selected_row) && +// (factored(A.graph.entries(entryIdx)) != 1)) { +// update_list(updateIdx) = A.graph.entries(entryIdx); +// ++updateIdx; +// } +// } +// size_type update_rows = updateIdx; +// for (size_type entryIdx = At.graph.row_map(selected_row); +// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { +// if ((At.graph.entries(entryIdx) != selected_row) && +// (factored(A.graph.entries(entryIdx)) != 1)) { +// bool already_updated = false; +// for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { +// if (At.graph.entries(entryIdx) == update_list(checkIdx)) { +// already_updated = true; +// break; +// } +// } +// if (already_updated == false) { +// update_list(updateIdx) = At.graph.entries(entryIdx); +// ++updateIdx; +// } +// } +// } +// update_list_length(0) = updateIdx; +// } +// }; + template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 1c5216bfe5..a69e7a0e75 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -73,6 +73,7 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; + using team_range_policy_type = Kokkos::TeamPolicy; // Numerical phase: // loop over rows @@ -85,19 +86,19 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { KokkosSparse::sort_crs_matrix(At); values_mag_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); - col_ind_type update_list_length("update list length", 1); - typename col_ind_type::HostMirror update_list_length_host = - 
Kokkos::create_mirror_view(update_list_length); + ordinal_type update_list_len = 0; col_ind_type update_list("update list", A.numRows()); col_ind_type factored("factored rows", A.numRows()); Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, 0, handle.permutation, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: initial fill computation", - range_policy_type(0, Atmp.numRows()), MDF_df_norm); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_df_norm(Atmp, At, 0, handle.permutation, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for( + "MDF: initial fill computation", + team_range_policy_type(Atmp.numRows(), Kokkos::AUTO, Kokkos::AUTO), + MDF_df_norm); for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { @@ -106,36 +107,54 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { static_cast(factorization_step)); } - Kokkos::deep_copy(update_list_length_host, update_list_length); - range_policy_type updatePolicy(0, update_list_length_host(0)); - KokkosSparse::Impl::MDF_selective_discarded_fill_norm - MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, - update_list, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, - MDF_update_df_norm); + { + team_range_policy_type updatePolicy(update_list_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + discarded_fill, deficiency, verbosity_level, + update_list); + Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, + MDF_update_df_norm); + } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; - KokkosSparse::Impl::MDF_select_row 
MDF_row_selector( - factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, - handle.permutation); - Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, - selected_row_idx); - - KokkosSparse::Impl::MDF_compute_list_length - compute_list_length(selected_row_idx, Atmp, At, handle.permutation, - factored, update_list_length, update_list); - Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), - compute_list_length); - - KokkosSparse::Impl::MDF_factorize_row factorize_row( - Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, - handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, verbosity_level); - Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), - factorize_row); + { + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); + KokkosSparse::Impl::MDF_select_row MDF_row_selector( + factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, + handle.permutation); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + } + + ordinal_type selected_row_len = 0; + { + team_range_policy_type updateListPolicy( + 1, Kokkos::AUTO); // (vector overloads required for scans to use + // vector parallel not provided by kokkos yet) + KokkosSparse::Impl::MDF_compute_list_length updateList( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, update_list, verbosity_level); + update_list_len = 0; + Kokkos::parallel_reduce("MDF: compute update list", updateListPolicy, + updateList, update_list_len, selected_row_len); + } + + // If this was the last row no need to update A and At! 
+ if (factorization_step < A.numRows() - 1) { + team_range_policy_type factorizePolicy(selected_row_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_factorize_row factorize_row( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, update_list, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", factorizePolicy, + factorize_row); + } if (verbosity_level > 0) { printf("\n"); diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index f6e4d0bc84..67aee2cbdc 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -16,14 +16,197 @@ #include #include - #include "KokkosSparse_mdf.hpp" +#include "KokkosSparse_CrsMatrix.hpp" namespace Test { +// void foo(){ + +// // const value_type four = static_cast(4.0); + +// constexpr ordinal_type numRows = 100; +// constexpr ordinal_type numCols = numRows; +// row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), +// numRows + 1); Kokkos::deep_copy(row_map,0); + +// constexpr value_type perc_fill = 0.3; +// constexpr size_type targetNonZerosPerRow = numRows*perc_fill; +// constexpr value_type num_fill_scl = 0.6; + +// Kokkos::Random_XorShift64_Pool random(13718 + 3); +// Kokkos::fill_random(row_map, random, +// size_type(targetNonZerosPerRow*num_fill_scl), +// value_type(targetNonZerosPerRow/num_fill_scl)); + +// size_type numNonZeros = 0; +// Kokkos::parallel_scan( +// Kokkos::RangePolicy(0,numRows+1), +// KOKKOS_LAMBDA(ordinal_type i,bool is_final,size_type & runningNZ){ +// if (is_final) { +// const auto curr_val = row_map[i]; +// row_map[i] = runningNZ; +// if (i < numRows) runningNZ += curr_val; +// } +// else { +// runningNZ += row_map[i]; +// } +// }, +// numNonZeros +// ); + +// // constexpr size_type numNonZeros = 64; +// // row_map_type 
row_map("row map", numRows + 1); +// col_ind_type col_ind("column indices", numNonZeros); +// values_type values("values", numNonZeros); +// Kokkos::fill_random(values, random, value_type(1.0), value_type(10.)); + +// } + +template +KokkosSparse::CrsMatrix +make_adv_diffusion_matrix(const scalar_type beta, const scalar_type vel_mag, + const size_type Nx, const size_type Ny) { + using crs_matrix_type = KokkosSparse::CrsMatrix; + using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using row_map_type = typename crs_graph_type::row_map_type::non_const_type; + using col_ind_type = typename crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using value_type = typename crs_matrix_type::value_type; + using execution_space = typename crs_matrix_type::execution_space; + + const ordinal_type numRows = Nx * Ny; + const ordinal_type& numCols = numRows; + row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), + numRows + 1); + + ordinal_type numNonZeros = 0; + Kokkos::parallel_scan( + Kokkos::RangePolicy(ordinal_type(0), + ordinal_type(numRows + 1)), + KOKKOS_LAMBDA(ordinal_type i, ordinal_type & runningNZ, bool is_final) { + const auto curr_val = (i == 0) ? 
1 : 5; + if (is_final) row_map[i] = runningNZ; + if (i < numRows) runningNZ += curr_val; + }, + numNonZeros); + + col_ind_type col_ind("column indices", numNonZeros); + values_type values("values", numNonZeros); + Kokkos::parallel_for( + Kokkos::MDRangePolicy >({0, 0}, {Nx, Ny}), + KOKKOS_LAMBDA(ordinal_type iX, ordinal_type iY) { + const ordinal_type row_XY = iX + Nx * iY; + auto map_ind = row_map(row_XY); + if (row_XY == 0) { + col_ind(map_ind) = row_XY; + values(map_ind) = 1.; + return; + } + + const ordinal_type nX = (iX + Nx - 1) % Nx; + const ordinal_type pX = (iX + 1) % Nx; + const ordinal_type nY = (iY + Ny - 1) % Ny; + const ordinal_type pY = (iY + 1) % Ny; + + const ordinal_type row_pXY = pX + Nx * iY; + const ordinal_type row_nXY = nX + Nx * iY; + const ordinal_type row_XpY = iX + Nx * pY; + const ordinal_type row_XnY = iX + Nx * nY; + + // Negative y dir + col_ind(map_ind) = row_XnY; + values(map_ind) = beta; + ++map_ind; + // Negative x dir + col_ind(map_ind) = row_nXY; + values(map_ind) = beta - vel_mag; + ++map_ind; + // Middle + col_ind(map_ind) = row_XY; + values(map_ind) = -4.0 * beta + vel_mag; + ++map_ind; + // Positive x dir + col_ind(map_ind) = row_pXY; + values(map_ind) = beta; + ++map_ind; + // Positive y dir + col_ind(map_ind) = row_XpY; + values(map_ind) = beta; + }); + + return crs_matrix_type("A", numRows, numCols, numNonZeros, values, row_map, + col_ind); +} + +template +void run_test_mdf_recr_issue() { // + + // using execution_space = Kokkos::Serial; + using execution_space = typename device::execution_space; + + constexpr int num_teams = 10; + constexpr int num_per_team = 10; + Kokkos::View m_data( + Kokkos::ViewAllocateWithoutInitializing("data"), num_teams, num_per_team); + Kokkos::View m_num_entr( + Kokkos::ViewAllocateWithoutInitializing("data"), num_teams); + + using team_policy_t = Kokkos::TeamPolicy; + using member_t = typename team_policy_t::member_type; + + Kokkos::parallel_for(team_policy_t(num_teams, Kokkos::AUTO, 
Kokkos::AUTO), + KOKKOS_LAMBDA(member_t team) { + const auto iTeam = team.league_rank(); + + // int num_added; + Kokkos::parallel_scan( + Kokkos::TeamVectorRange(team, num_per_team), + [&](int i, int& partial_num, bool final) { + if (final) m_data(iTeam, i) = partial_num; + partial_num += i; + }); + + // // Do something with num_entr ... + // Kokkos::single(Kokkos::PerTeam(team),[&]{ + // m_num_entr(iTeam) = num_added; + // }); + }); +} + +template +void run_test_mdf() { //_timing + using crs_matrix_type = KokkosSparse::CrsMatrix; + using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using row_map_type = typename crs_graph_type::row_map_type::non_const_type; + using col_ind_type = typename crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using value_type = typename crs_matrix_type::value_type; + using execution_space = typename crs_matrix_type::execution_space; + + const scalar_type beta = 1.0; + const scalar_type vel_mag = 0.5; + const size_type Nx = 400; + const size_type Ny = 400; + + crs_matrix_type A = + make_adv_diffusion_matrix( + beta, vel_mag, Nx, Ny); + + KokkosSparse::Experimental::MDF_handle handle(A); + handle.set_verbosity(0); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + KokkosSparse::Experimental::mdf_numeric(A, handle); +} + template -void run_test_mdf() { +void run_test_mdf_real() { // using crs_matrix_type = KokkosSparse::CrsMatrix; using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; From 64c08fcab406a91323635ee7580f1d453e072c47 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 12:22:20 -0600 Subject: [PATCH 072/231] move to unordered_set for factored rows --- sparse/impl/KokkosSparse_mdf_impl.hpp | 828 ++------------------------ sparse/src/KokkosSparse_mdf.hpp | 28 +- sparse/unit_test/Test_Sparse_mdf.hpp | 185 +----- 3 files changed, 59 insertions(+), 982 deletions(-) diff --git 
a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 51f3ae98c3..1042c453f9 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -18,6 +18,7 @@ #define KOKKOSSPARSE_MDF_IMPL_HPP_ #include +#include #include "KokkosKernels_Sorting.hpp" #include "KokkosSparse_findRelOffset.hpp" #include @@ -67,6 +68,8 @@ struct MDF_count_lower { template struct MDF_discarded_fill_norm { + using device_type = typename crs_matrix_type::device_type; + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; @@ -78,10 +81,13 @@ struct MDF_discarded_fill_norm { using KAS = typename Kokkos::ArithTraits; using scalar_mag_type = typename KAS::mag_type; using KAM = typename Kokkos::ArithTraits; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; + permutation_set_type permutation_set; col_ind_type update_list; values_mag_type discarded_fill; @@ -91,6 +97,7 @@ struct MDF_discarded_fill_norm { MDF_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, ordinal_type factorization_step_, col_ind_type permutation_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, col_ind_type deficiency_, int verbosity_, col_ind_type update_list_ = col_ind_type{}) @@ -98,6 +105,7 @@ struct MDF_discarded_fill_norm { At(At_), factorization_step(factorization_step_), permutation(permutation_), + permutation_set(permutation_set_), update_list(update_list_), discarded_fill(discarded_fill_), deficiency(deficiency_), @@ -176,15 +184,7 @@ struct MDF_discarded_fill_norm { // Check if row already eliminated if constexpr (!is_initial_fill) { - bool row_eliminated = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, factorization_step), - [&](const ordinal_type stepIdx, bool& running_row_eliminated) { - 
running_row_eliminated |= fillRowIdx == permutation(stepIdx); - }, - Kokkos::LOr(row_eliminated)); - - if (row_eliminated) return; + if (permutation_set.exists(fillRowIdx)) return; } const auto fillRowView = A.rowConst(fillRowIdx); @@ -198,13 +198,7 @@ struct MDF_discarded_fill_norm { if (fillColIdx == rowIdx) return; if constexpr (!is_initial_fill) { - bool col_eliminated = false; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - col_eliminated |= fillColIdx == permutation(stepIdx); - } - - if (col_eliminated) return; + if (permutation_set.exists(fillColIdx)) return; } bool entryIsDiscarded = true; @@ -244,270 +238,6 @@ struct MDF_discarded_fill_norm { } }; // MDF_discarded_fill_norm -// template -// struct MDF_discarded_fill_norm_old { -// using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; -// using col_ind_type = -// typename static_crs_graph_type::entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using values_mag_type = -// typename MDF_types::values_mag_type; using size_type -// = typename crs_matrix_type::size_type; using ordinal_type = typename -// crs_matrix_type::ordinal_type; using scalar_type = typename -// crs_matrix_type::value_type; using KAS = typename -// Kokkos::ArithTraits; using scalar_mag_type = typename -// KAS::mag_type; using KAM = typename -// Kokkos::ArithTraits; - -// crs_matrix_type A, At; -// ordinal_type factorization_step; -// col_ind_type permutation; - -// values_mag_type discarded_fill; -// col_ind_type deficiency; -// int verbosity; - -// MDF_discarded_fill_norm_old(crs_matrix_type A_, crs_matrix_type At_, -// ordinal_type factorization_step_, -// col_ind_type permutation_, -// values_mag_type discarded_fill_, -// col_ind_type deficiency_, int verbosity_) -// : A(A_), -// At(At_), -// factorization_step(factorization_step_), -// permutation(permutation_), -// discarded_fill(discarded_fill_), -// 
deficiency(deficiency_), -// verbosity(verbosity_){}; - -// KOKKOS_INLINE_FUNCTION -// void operator()(const ordinal_type i) const { -// ordinal_type rowIdx = permutation(i); -// scalar_mag_type discard_norm = KAM::zero(); -// scalar_type diag_val = KAS::zero(); -// bool entryIsDiscarded = true; -// ordinal_type numFillEntries = 0; -// for (size_type alphaIdx = At.graph.row_map(rowIdx); -// alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { -// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); -// bool row_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) -// { -// if (fillRowIdx == permutation(stepIdx)) { -// row_not_eliminated = false; -// } -// } - -// if (fillRowIdx != rowIdx && row_not_eliminated) { -// for (size_type betaIdx = A.graph.row_map(rowIdx); -// betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { -// ordinal_type fillColIdx = A.graph.entries(betaIdx); -// bool col_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; -// ++stepIdx) { -// if (fillColIdx == permutation(stepIdx)) { -// col_not_eliminated = false; -// } -// } - -// if (fillColIdx != rowIdx && col_not_eliminated) { -// entryIsDiscarded = true; -// for (size_type entryIdx = A.graph.row_map(fillRowIdx); -// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { -// if (A.graph.entries(entryIdx) == fillColIdx) { -// entryIsDiscarded = false; -// } -// } -// if (entryIsDiscarded) { -// numFillEntries += 1; -// discard_norm += -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); -// if (verbosity > 1) { -// if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Adding value A[%d,%d]=%f to discard norm of row %d\n", -// int(At.graph.entries(alphaIdx)), -// int(A.graph.entries(betaIdx)), -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), -// int(rowIdx)); -// } 
-// } -// } -// } -// } -// } else if (fillRowIdx == rowIdx) { -// diag_val = At.values(alphaIdx); -// if (verbosity > 1) { -// if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d diagonal value detected, values(%d)=%f\n", -// int(rowIdx), int(alphaIdx), At.values(alphaIdx)); -// } else if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d diagonal value detected, |values(%d)|=%f\n", -// int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); -// } -// } -// } -// } - -// // TODO add a check on `diag_val == zero` -// discard_norm = discard_norm / KAS::abs(diag_val * diag_val); -// discarded_fill(rowIdx) = discard_norm; -// deficiency(rowIdx) = numFillEntries; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) -// - -// A.graph.row_map(rowIdx) - -// 1); -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d has discarded fill of %f, deficiency of %d and degree -// %d\n", static_cast(rowIdx), -// static_cast(KAM::sqrt(discard_norm)), -// static_cast(deficiency(rowIdx)), static_cast(degree)); -// } -// } -// } - -// }; // MDF_discarded_fill_norm_old - -template -struct MDF_selective_discarded_fill_norm { - using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using col_ind_type = - typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; - using scalar_mag_type = typename KAS::mag_type; - using KAM = typename Kokkos::ArithTraits; - using values_mag_type = typename MDF_types::values_mag_type; - - crs_matrix_type A, At; - ordinal_type factorization_step; - col_ind_type permutation; - col_ind_type 
update_list; - - values_mag_type discarded_fill; - col_ind_type deficiency; - int verbosity; - - MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, - ordinal_type factorization_step_, - col_ind_type permutation_, - col_ind_type update_list_, - values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) - : A(A_), - At(At_), - factorization_step(factorization_step_), - permutation(permutation_), - update_list(update_list_), - discarded_fill(discarded_fill_), - deficiency(deficiency_), - verbosity(verbosity_){}; - - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(update_list(i)); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } - - if (fillColIdx != rowIdx && col_not_eliminated) { - entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * 
A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - static_cast(At.graph.entries(alphaIdx)), - static_cast(A.graph.entries(betaIdx)), - static_cast( - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), - static_cast(rowIdx)); - } - } - } - } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(At.values(alphaIdx))); - } else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, |values(%d)|=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(KAS::abs(At.values(alphaIdx)))); - } - } - } - } - - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } - } - } - -}; // MDF_selective_discarded_fill_norm - template struct MDF_select_row { using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -630,6 +360,11 @@ KOKKOS_INLINE_FUNCTION bool sorted_view_contains( template struct MDF_factorize_row { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename 
crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -640,6 +375,8 @@ struct MDF_factorize_row { using value_type = typename crs_matrix_type::value_type; using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; @@ -652,6 +389,7 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -660,15 +398,12 @@ struct MDF_factorize_row { int verbosity; - using execution_space = typename crs_matrix_type::execution_space; - using team_policy_t = Kokkos::TeamPolicy; - using team_member_t = typename team_policy_t::member_type; - MDF_factorize_row(crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, @@ -683,6 +418,7 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + permutation_set(permutation_set_), discarded_fill(discarded_fill_), factored(factored_), selected_row_idx(selected_row_idx_), @@ -700,17 +436,7 @@ struct MDF_factorize_row { const auto rowInd = colView.colidx(alpha); if (rowInd == selected_row) return; - { - bool row_eliminated = false; - Kokkos::parallel_reduce( - 
Kokkos::TeamVectorRange(team, factorization_step), - [&](const ordinal_type step, bool& partial) { - partial |= rowInd == permutation(step); - }, - Kokkos::LOr(row_eliminated)); - - if (row_eliminated) return; - } + if (permutation_set.exists(rowInd)) return; // Only one of the values will match selected so can just sum all contribs const auto rowView = A.rowConst(selected_row); @@ -733,17 +459,7 @@ struct MDF_factorize_row { if (colInd == selected_row) return; - { - bool col_eliminated = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, factorization_step), - [&](const ordinal_type step, bool& partial) { - partial |= colInd == permutation(step); - }, - Kokkos::LOr(col_eliminated)); - - if (col_eliminated) return; - } + if (permutation_set.exists(colInd)) return; const auto subVal = colView.value(alpha) * rowView.value(beta) / diag; @@ -767,155 +483,13 @@ struct MDF_factorize_row { } }; -// template -// struct MDF_factorize_row_heir_old { -// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: -// row_map_type::non_const_type; -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using ordinal_type = -// typename crs_matrix_type::ordinal_type; using size_type = typename -// crs_matrix_type::size_type; using value_type = typename -// crs_matrix_type::value_type; using values_mag_type = typename -// MDF_types::values_mag_type; using value_mag_type = -// typename values_mag_type::value_type; - -// crs_matrix_type A, At; - -// row_map_type row_mapL; -// col_ind_type entriesL; -// values_type valuesL; - -// row_map_type row_mapU; -// col_ind_type entriesU; -// values_type valuesU; - -// col_ind_type permutation, permutation_inv; -// values_mag_type discarded_fill; -// col_ind_type factored; -// ordinal_type selected_row_idx, factorization_step; - -// col_ind_type update_list; - -// int verbosity; - -// 
using execution_space = typename crs_matrix_type::execution_space; -// using team_policy_t = Kokkos::TeamPolicy; -// using team_member_t = typename team_policy_t::member_type; - -// MDF_factorize_row_heir_old(crs_matrix_type A_, crs_matrix_type At_, -// row_map_type row_mapL_, col_ind_type entriesL_, -// values_type valuesL_, row_map_type row_mapU_, -// col_ind_type entriesU_, values_type valuesU_, -// col_ind_type permutation_, col_ind_type permutation_inv_, -// values_mag_type discarded_fill_, col_ind_type factored_, -// ordinal_type selected_row_idx_, -// ordinal_type factorization_step_, col_ind_type& -// update_list_, int verbosity_) -// : A(A_), -// At(At_), -// row_mapL(row_mapL_), -// entriesL(entriesL_), -// valuesL(valuesL_), -// row_mapU(row_mapU_), -// entriesU(entriesU_), -// valuesU(valuesU_), -// permutation(permutation_), -// permutation_inv(permutation_inv_), -// discarded_fill(discarded_fill_), -// factored(factored_), -// selected_row_idx(selected_row_idx_), -// factorization_step(factorization_step_), -// update_list(update_list_), -// verbosity(verbosity_){}; - -// //Phase 2, do facrotization -// KOKKOS_INLINE_FUNCTION -// void operator()(team_member_t team) const{ -// const ordinal_type selected_row = permutation(factorization_step); -// const auto rowView = A.rowConst(selected_row); -// const auto colView = At.rowConst(selected_row); - -// // If this was the last row no need to update A and At! 
-// if (factorization_step == A.numRows() - 1) { -// return; -// } - -// // Only one of the values will match selected so can just sum all -// contribs value_type diag = Kokkos::ArithTraits::zero(); -// Kokkos::parallel_reduce( -// Kokkos::TeamVectorRange(team,rowView.length), -// [&](const size_type alpha,value_type & running_diag){ -// if (rowView.colidx(alpha) == selected_row) -// running_diag = rowView.value(alpha); -// }, -// Kokkos::Sum(diag) -// ); - -// // Extract alpha and beta vectors -// // Then insert alpha*beta/diag_val if the corresponding -// // entry in A is non-zero. -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team,colView.length), -// [&](const ordinal_type alpha){ -// const auto rowInd = colView.colidx(alpha); -// auto fillRowView = A.row(rowInd); - -// if (rowInd == selected_row) return; - -// bool row_eliminated = false; -// Kokkos::parallel_reduce( -// Kokkos::ThreadVectorRange(team,factorization_step), -// [&](const ordinal_type step, bool & partial){ -// partial |= rowInd == permutation(step); -// }, -// Kokkos::LOr(row_eliminated) -// ); - -// if (row_eliminated) return; - -// Kokkos::parallel_for( -// Kokkos::ThreadVectorRange(team,rowView.length), -// [&](const ordinal_type beta){ -// const auto colInd = rowView.colidx(beta); - -// if (colInd == selected_row) return; - -// bool col_eliminated = false; -// for (ordinal_type step = 0; step < factorization_step; ++step){ -// col_eliminated |= colInd == permutation(step); -// } - -// if (col_eliminated) return; - -// const auto subVal = colView.colidx(alpha) * rowView.colidx(beta) -// / diag; for (ordinal_type gamma = 0; gamma < fillRowView.length; -// ++gamma){ -// if (colInd == fillRowView.colidx(gamma)){ -// Kokkos::atomic_sub( -// &fillRowView.value(gamma), -// subVal -// ); -// } -// } -// auto fillColView = At.row(colInd); -// for (ordinal_type delt = 0; delt < fillColView.length; ++delt){ -// if (rowInd == fillColView.colidx(delt)){ -// Kokkos::atomic_sub( -// 
&fillColView.value(delt), -// subVal -// ); -// } -// } -// }); -// } -// ); -// } -// }; - template struct MDF_compute_list_length { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -927,6 +501,9 @@ struct MDF_compute_list_length { using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; + using permutation_set_type = + Kokkos::UnorderedMap; + crs_matrix_type A, At; row_map_type row_mapL; @@ -938,6 +515,7 @@ struct MDF_compute_list_length { values_type valuesU; col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -946,18 +524,14 @@ struct MDF_compute_list_length { int verbosity; - using execution_space = typename crs_matrix_type::execution_space; - using team_policy_t = Kokkos::TeamPolicy; - using team_member_t = typename team_policy_t::member_type; - MDF_compute_list_length( crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, - col_ind_type permutation_inv_, values_mag_type discarded_fill_, - col_ind_type factored_, ordinal_type selected_row_idx_, - ordinal_type factorization_step_, col_ind_type& update_list_, - int verbosity_) + col_ind_type permutation_inv_, permutation_set_type permutation_set_, + values_mag_type discarded_fill_, col_ind_type factored_, + ordinal_type selected_row_idx_, ordinal_type factorization_step_, + col_ind_type& update_list_, int 
verbosity_) : A(A_), At(At_), row_mapL(row_mapL_), @@ -968,6 +542,7 @@ struct MDF_compute_list_length { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + permutation_set(permutation_set_), discarded_fill(discarded_fill_), factored(factored_), selected_row_idx(selected_row_idx_), @@ -999,6 +574,11 @@ struct MDF_compute_list_length { // Diagonal value of L entriesL(L_entryIdx) = selected_row; valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); + + // Insert into permutation set for later + const auto res = permutation_set.insert(selected_row); + (void)res; // avoid unused error + assert(res.success()); }); ++L_entryIdx; @@ -1137,332 +717,6 @@ struct MDF_compute_list_length { } }; -// template -// struct MDF_factorize_row_old { -// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: -// row_map_type::non_const_type; -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using ordinal_type = -// typename crs_matrix_type::ordinal_type; using size_type = typename -// crs_matrix_type::size_type; using value_type = typename -// crs_matrix_type::value_type; using values_mag_type = typename -// MDF_types::values_mag_type; using value_mag_type = -// typename values_mag_type::value_type; - -// crs_matrix_type A, At; - -// row_map_type row_mapL; -// col_ind_type entriesL; -// values_type valuesL; - -// row_map_type row_mapU; -// col_ind_type entriesU; -// values_type valuesU; - -// col_ind_type permutation, permutation_inv; -// values_mag_type discarded_fill; -// col_ind_type factored; -// ordinal_type selected_row_idx, factorization_step; - -// int verbosity; - -// MDF_factorize_row_old(crs_matrix_type A_, crs_matrix_type At_, -// row_map_type row_mapL_, col_ind_type entriesL_, -// values_type valuesL_, row_map_type row_mapU_, -// col_ind_type entriesU_, values_type valuesU_, -// col_ind_type 
permutation_, col_ind_type permutation_inv_, -// values_mag_type discarded_fill_, col_ind_type factored_, -// ordinal_type selected_row_idx_, -// ordinal_type factorization_step_, int verbosity_) -// : A(A_), -// At(At_), -// row_mapL(row_mapL_), -// entriesL(entriesL_), -// valuesL(valuesL_), -// row_mapU(row_mapU_), -// entriesU(entriesU_), -// valuesU(valuesU_), -// permutation(permutation_), -// permutation_inv(permutation_inv_), -// discarded_fill(discarded_fill_), -// factored(factored_), -// selected_row_idx(selected_row_idx_), -// factorization_step(factorization_step_), -// verbosity(verbosity_){}; - -// KOKKOS_INLINE_FUNCTION -// void operator()(const ordinal_type /* idx */) const { -// const ordinal_type selected_row = permutation(selected_row_idx); -// discarded_fill(selected_row) = -// Kokkos::ArithTraits::max(); - -// // Swap entries in permutation vectors -// permutation(selected_row_idx) = permutation(factorization_step); -// permutation(factorization_step) = selected_row; -// permutation_inv(permutation(factorization_step)) = factorization_step; -// permutation_inv(permutation(selected_row_idx)) = selected_row_idx; - -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); -// for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(permutation(rowIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } - -// // Insert the upper part of the selected row in U -// // including the diagonal term. 
-// value_type diag = Kokkos::ArithTraits::zero(); -// size_type U_entryIdx = row_mapU(factorization_step); -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { -// entriesU(U_entryIdx) = A.graph.entries(entryIdx); -// valuesU(U_entryIdx) = A.values(entryIdx); -// ++U_entryIdx; -// if (A.graph.entries(entryIdx) == selected_row) { -// diag = A.values(entryIdx); -// } -// } -// } -// row_mapU(factorization_step + 1) = U_entryIdx; -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", -// static_cast(selected_row), -// static_cast(diag)); -// } - -// if (verbosity > 2) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); -// for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; -// ++rowIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(row_mapU(rowIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); -// for (size_type entryIdx = row_mapU(0); -// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(entriesU(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); -// for (size_type entryIdx = row_mapU(0); -// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", -// static_cast(valuesU(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } - -// // Insert the lower part of the selected column of A -// // divided by its the diagonal value to obtain a unit -// // diagonal value in L. 
-// size_type L_entryIdx = row_mapL(factorization_step); -// entriesL(L_entryIdx) = selected_row; -// valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); -// ++L_entryIdx; -// for (size_type entryIdx = At.graph.row_map(selected_row); -// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { -// if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { -// entriesL(L_entryIdx) = At.graph.entries(entryIdx); -// valuesL(L_entryIdx) = At.values(entryIdx) / diag; -// ++L_entryIdx; -// } -// } -// row_mapL(factorization_step + 1) = L_entryIdx; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 2) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", -// static_cast(factorization_step), -// static_cast(factorization_step), -// static_cast(factorization_step + 1), -// static_cast(row_mapL(factorization_step)), -// static_cast(row_mapL(factorization_step + 1))); -// for (size_type entryIdx = row_mapL(factorization_step); -// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(entriesL(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); -// for (size_type entryIdx = row_mapL(factorization_step); -// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", -// static_cast(valuesL(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } - -// // If this was the last row no need to update A and At! -// if (factorization_step == A.numRows() - 1) { -// return; -// } - -// // Finally we want to update A and At with the values -// // that where not discarded during factorization. -// // Note: this is almost the same operation as computing -// // the norm of the discarded fill... 
- -// // First step: find the diagonal entry in selected_row -// value_type diag_val = Kokkos::ArithTraits::zero(); -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// ordinal_type colIdx = A.graph.entries(entryIdx); -// if (selected_row == colIdx) { -// diag_val = A.values(entryIdx); -// } -// } - -// // Extract alpha and beta vectors -// // Then insert alpha*beta/diag_val if the corresponding -// // entry in A is non-zero. -// for (size_type alphaIdx = At.graph.row_map(selected_row); -// alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { -// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); -// bool row_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) -// { -// if (fillRowIdx == permutation(stepIdx)) { -// row_not_eliminated = false; -// } -// } - -// if ((fillRowIdx != selected_row) && row_not_eliminated) { -// for (size_type betaIdx = A.graph.row_map(selected_row); -// betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { -// ordinal_type fillColIdx = A.graph.entries(betaIdx); -// bool col_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; -// ++stepIdx) { -// if (fillColIdx == permutation(stepIdx)) { -// col_not_eliminated = false; -// } -// } - -// if ((fillColIdx != selected_row) && col_not_eliminated) { -// for (size_type entryIdx = A.graph.row_map(fillRowIdx); -// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { -// if (A.graph.entries(entryIdx) == fillColIdx) { -// A.values(entryIdx) -= -// At.values(alphaIdx) * A.values(betaIdx) / diag_val; -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 1) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "A[%d, %d] -= %f\n", static_cast(fillRowIdx), -// static_cast(fillColIdx), -// static_cast(At.values(alphaIdx) * -// A.values(betaIdx) / diag_val)); -// } -// } -// } -// } - -// for (size_type entryIdx = 
At.graph.row_map(fillColIdx); -// entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { -// if (At.graph.entries(entryIdx) == fillRowIdx) { -// At.values(entryIdx) -= -// At.values(alphaIdx) * A.values(betaIdx) / diag_val; -// } -// } -// } -// } -// } -// } - -// factored(selected_row) = 1; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); -// for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "%f ", static_cast(A.values(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); -// for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "%f ", static_cast(At.values(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } -// } // operator() - -// }; // MDF_factorize_row_old - -// template -// struct MDF_compute_list_length_old { -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using ordinal_type = typename crs_matrix_type::ordinal_type; -// using size_type = typename crs_matrix_type::size_type; - -// ordinal_type selected_row_idx; -// crs_matrix_type A; -// crs_matrix_type At; -// col_ind_type permutation; -// col_ind_type factored; -// col_ind_type update_list_length; -// col_ind_type update_list; - -// MDF_compute_list_length_old(const ordinal_type rowIdx_, const -// crs_matrix_type& A_, -// const crs_matrix_type& At_, -// const col_ind_type& permutation_, -// const col_ind_type factored_, -// col_ind_type& update_list_length_, -// col_ind_type& update_list_) -// : selected_row_idx(rowIdx_), -// A(A_), -// At(At_), -// permutation(permutation_), -// factored(factored_), -// update_list_length(update_list_length_), -// update_list(update_list_) {} - -// KOKKOS_INLINE_FUNCTION -// void operator()(const size_type /*idx*/) const { -// const ordinal_type 
selected_row = permutation(selected_row_idx); - -// size_type updateIdx = 0; -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// if ((A.graph.entries(entryIdx) != selected_row) && -// (factored(A.graph.entries(entryIdx)) != 1)) { -// update_list(updateIdx) = A.graph.entries(entryIdx); -// ++updateIdx; -// } -// } -// size_type update_rows = updateIdx; -// for (size_type entryIdx = At.graph.row_map(selected_row); -// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { -// if ((At.graph.entries(entryIdx) != selected_row) && -// (factored(A.graph.entries(entryIdx)) != 1)) { -// bool already_updated = false; -// for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { -// if (At.graph.entries(entryIdx) == update_list(checkIdx)) { -// already_updated = true; -// break; -// } -// } -// if (already_updated == false) { -// update_list(updateIdx) = At.graph.entries(entryIdx); -// ++updateIdx; -// } -// } -// } -// update_list_length(0) = updateIdx; -// } -// }; - template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index a69e7a0e75..272180debe 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -27,6 +27,7 @@ #ifndef KOKKOSSPARSE_MDF_HPP_ #define KOKKOSSPARSE_MDF_HPP_ +#include #include "KokkosSparse_mdf_handle.hpp" #include "KokkosSparse_mdf_impl.hpp" @@ -71,10 +72,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using ordinal_type = typename crs_matrix_type::ordinal_type; using value_mag_type = typename values_mag_type::value_type; + using device_type = typename crs_matrix_type::device_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; using team_range_policy_type = Kokkos::TeamPolicy; + using permutation_set_type = + Kokkos::UnorderedMap; + // Numerical phase: // 
loop over rows // compute discarded fill of each row @@ -91,10 +96,11 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { col_ind_type factored("factored rows", A.numRows()); Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + permutation_set_type permutation_set(A.numRows()); KokkosSparse::Impl::MDF_discarded_fill_norm - MDF_df_norm(Atmp, At, 0, handle.permutation, discarded_fill, deficiency, - verbosity_level); + MDF_df_norm(Atmp, At, 0, handle.permutation, permutation_set, + discarded_fill, deficiency, verbosity_level); Kokkos::parallel_for( "MDF: initial fill computation", team_range_policy_type(Atmp.numRows(), Kokkos::AUTO, Kokkos::AUTO), @@ -112,8 +118,8 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { Kokkos::AUTO); KokkosSparse::Impl::MDF_discarded_fill_norm MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, - discarded_fill, deficiency, verbosity_level, - update_list); + permutation_set, discarded_fill, deficiency, + verbosity_level, update_list); Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, MDF_update_df_norm); } @@ -130,14 +136,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { ordinal_type selected_row_len = 0; { - team_range_policy_type updateListPolicy( - 1, Kokkos::AUTO); // (vector overloads required for scans to use - // vector parallel not provided by kokkos yet) + // vector overloads required for scans to use vector parallel not yet + // provided by kokkos (https://github.com/kokkos/kokkos/issues/6259) + team_range_policy_type updateListPolicy(1, Kokkos::AUTO); KokkosSparse::Impl::MDF_compute_list_length updateList( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, update_list, verbosity_level); + 
handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); update_list_len = 0; Kokkos::parallel_reduce("MDF: compute update list", updateListPolicy, updateList, update_list_len, selected_row_len); @@ -150,8 +156,8 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { KokkosSparse::Impl::MDF_factorize_row factorize_row( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, update_list, verbosity_level); + handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); Kokkos::parallel_for("MDF: factorize row", factorizePolicy, factorize_row); } diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 67aee2cbdc..4b5b65aeb3 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -21,192 +21,9 @@ namespace Test { -// void foo(){ - -// // const value_type four = static_cast(4.0); - -// constexpr ordinal_type numRows = 100; -// constexpr ordinal_type numCols = numRows; -// row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), -// numRows + 1); Kokkos::deep_copy(row_map,0); - -// constexpr value_type perc_fill = 0.3; -// constexpr size_type targetNonZerosPerRow = numRows*perc_fill; -// constexpr value_type num_fill_scl = 0.6; - -// Kokkos::Random_XorShift64_Pool random(13718 + 3); -// Kokkos::fill_random(row_map, random, -// size_type(targetNonZerosPerRow*num_fill_scl), -// value_type(targetNonZerosPerRow/num_fill_scl)); - -// size_type numNonZeros = 0; -// Kokkos::parallel_scan( -// Kokkos::RangePolicy(0,numRows+1), -// KOKKOS_LAMBDA(ordinal_type i,bool is_final,size_type & runningNZ){ -// if (is_final) { -// const auto curr_val = row_map[i]; 
-// row_map[i] = runningNZ; -// if (i < numRows) runningNZ += curr_val; -// } -// else { -// runningNZ += row_map[i]; -// } -// }, -// numNonZeros -// ); - -// // constexpr size_type numNonZeros = 64; -// // row_map_type row_map("row map", numRows + 1); -// col_ind_type col_ind("column indices", numNonZeros); -// values_type values("values", numNonZeros); -// Kokkos::fill_random(values, random, value_type(1.0), value_type(10.)); - -// } - -template -KokkosSparse::CrsMatrix -make_adv_diffusion_matrix(const scalar_type beta, const scalar_type vel_mag, - const size_type Nx, const size_type Ny) { - using crs_matrix_type = KokkosSparse::CrsMatrix; - using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using row_map_type = typename crs_graph_type::row_map_type::non_const_type; - using col_ind_type = typename crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using value_type = typename crs_matrix_type::value_type; - using execution_space = typename crs_matrix_type::execution_space; - - const ordinal_type numRows = Nx * Ny; - const ordinal_type& numCols = numRows; - row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), - numRows + 1); - - ordinal_type numNonZeros = 0; - Kokkos::parallel_scan( - Kokkos::RangePolicy(ordinal_type(0), - ordinal_type(numRows + 1)), - KOKKOS_LAMBDA(ordinal_type i, ordinal_type & runningNZ, bool is_final) { - const auto curr_val = (i == 0) ? 
1 : 5; - if (is_final) row_map[i] = runningNZ; - if (i < numRows) runningNZ += curr_val; - }, - numNonZeros); - - col_ind_type col_ind("column indices", numNonZeros); - values_type values("values", numNonZeros); - Kokkos::parallel_for( - Kokkos::MDRangePolicy >({0, 0}, {Nx, Ny}), - KOKKOS_LAMBDA(ordinal_type iX, ordinal_type iY) { - const ordinal_type row_XY = iX + Nx * iY; - auto map_ind = row_map(row_XY); - if (row_XY == 0) { - col_ind(map_ind) = row_XY; - values(map_ind) = 1.; - return; - } - - const ordinal_type nX = (iX + Nx - 1) % Nx; - const ordinal_type pX = (iX + 1) % Nx; - const ordinal_type nY = (iY + Ny - 1) % Ny; - const ordinal_type pY = (iY + 1) % Ny; - - const ordinal_type row_pXY = pX + Nx * iY; - const ordinal_type row_nXY = nX + Nx * iY; - const ordinal_type row_XpY = iX + Nx * pY; - const ordinal_type row_XnY = iX + Nx * nY; - - // Negative y dir - col_ind(map_ind) = row_XnY; - values(map_ind) = beta; - ++map_ind; - // Negative x dir - col_ind(map_ind) = row_nXY; - values(map_ind) = beta - vel_mag; - ++map_ind; - // Middle - col_ind(map_ind) = row_XY; - values(map_ind) = -4.0 * beta + vel_mag; - ++map_ind; - // Positive x dir - col_ind(map_ind) = row_pXY; - values(map_ind) = beta; - ++map_ind; - // Positive y dir - col_ind(map_ind) = row_XpY; - values(map_ind) = beta; - }); - - return crs_matrix_type("A", numRows, numCols, numNonZeros, values, row_map, - col_ind); -} - -template -void run_test_mdf_recr_issue() { // - - // using execution_space = Kokkos::Serial; - using execution_space = typename device::execution_space; - - constexpr int num_teams = 10; - constexpr int num_per_team = 10; - Kokkos::View m_data( - Kokkos::ViewAllocateWithoutInitializing("data"), num_teams, num_per_team); - Kokkos::View m_num_entr( - Kokkos::ViewAllocateWithoutInitializing("data"), num_teams); - - using team_policy_t = Kokkos::TeamPolicy; - using member_t = typename team_policy_t::member_type; - - Kokkos::parallel_for(team_policy_t(num_teams, Kokkos::AUTO, 
Kokkos::AUTO), - KOKKOS_LAMBDA(member_t team) { - const auto iTeam = team.league_rank(); - - // int num_added; - Kokkos::parallel_scan( - Kokkos::TeamVectorRange(team, num_per_team), - [&](int i, int& partial_num, bool final) { - if (final) m_data(iTeam, i) = partial_num; - partial_num += i; - }); - - // // Do something with num_entr ... - // Kokkos::single(Kokkos::PerTeam(team),[&]{ - // m_num_entr(iTeam) = num_added; - // }); - }); -} - -template -void run_test_mdf() { //_timing - using crs_matrix_type = KokkosSparse::CrsMatrix; - using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using row_map_type = typename crs_graph_type::row_map_type::non_const_type; - using col_ind_type = typename crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using value_type = typename crs_matrix_type::value_type; - using execution_space = typename crs_matrix_type::execution_space; - - const scalar_type beta = 1.0; - const scalar_type vel_mag = 0.5; - const size_type Nx = 400; - const size_type Ny = 400; - - crs_matrix_type A = - make_adv_diffusion_matrix( - beta, vel_mag, Nx, Ny); - - KokkosSparse::Experimental::MDF_handle handle(A); - handle.set_verbosity(0); - KokkosSparse::Experimental::mdf_symbolic(A, handle); - KokkosSparse::Experimental::mdf_numeric(A, handle); -} - template -void run_test_mdf_real() { // +void run_test_mdf() { using crs_matrix_type = KokkosSparse::CrsMatrix; using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; From 0eae1f225967a2ae03ee9c0df202363cac270dad Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 12:58:31 -0600 Subject: [PATCH 073/231] fix clangformat --- sparse/impl/KokkosSparse_mdf_impl.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 1042c453f9..4383279ad0 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp 
+++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -441,12 +441,13 @@ struct MDF_factorize_row { // Only one of the values will match selected so can just sum all contribs const auto rowView = A.rowConst(selected_row); value_type diag = Kokkos::ArithTraits::zero(); - Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, rowView.length), - [&](const size_type ind, value_type& running_diag) { - if (rowView.colidx(ind) == selected_row) - running_diag = rowView.value(ind); - }, - Kokkos::Sum(diag)); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, rowView.length), + [&](const size_type ind, value_type& running_diag) { + if (rowView.colidx(ind) == selected_row) + running_diag = rowView.value(ind); + }, + Kokkos::Sum(diag)); // Extract alpha and beta vectors // Then insert alpha*beta/diag_val if the corresponding From 3868be1490b9ede841e4f1eaa58e2674050900a2 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 16:35:01 -0600 Subject: [PATCH 074/231] MDF: convert remaining count functor to hierarchical parallelism --- sparse/impl/KokkosSparse_mdf_impl.hpp | 32 ++++++++++++++++++--------- sparse/src/KokkosSparse_mdf.hpp | 9 ++++---- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index d8754e591c..a893a47aff 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -40,27 +40,39 @@ struct MDF_count_lower { entries_type::non_const_type; using size_type = typename crs_matrix_type::ordinal_type; using value_type = typename crs_matrix_type::size_type; + using KAV = typename Kokkos::ArithTraits; crs_matrix_type A; col_ind_type permutation; col_ind_type permutation_inv; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + MDF_count_lower(crs_matrix_type A_, col_ind_type permutation_, col_ind_type 
permutation_inv_) : A(A_), permutation(permutation_), permutation_inv(permutation_inv_){}; KOKKOS_INLINE_FUNCTION - void operator()(const size_type rowIdx, value_type& update) const { - permutation(rowIdx) = rowIdx; - permutation_inv(rowIdx) = rowIdx; - for (value_type entryIdx = A.graph.row_map(rowIdx); - entryIdx < A.graph.row_map(rowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) <= rowIdx) { - update += 1; - } - } + void operator()(const team_member_t team, value_type& update) const { + const auto rowIdx = team.league_rank(); + const auto rowView = A.graph.rowConst(rowIdx); + + value_type local_contrib = KAV::zero(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type entryIdx, value_type& partial) { + if (rowView(entryIdx) <= rowIdx) partial += 1; + }, + Kokkos::Sum(local_contrib)); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + permutation(rowIdx) = rowIdx; + permutation_inv(rowIdx) = rowIdx; + update += local_contrib; + }); } - }; // MDF_count_lower template diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 1c5216bfe5..4b5c611d42 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -38,19 +38,18 @@ void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; - using execution_space = typename crs_matrix_type::execution_space; - using range_policy_type = Kokkos::RangePolicy; + using execution_space = typename crs_matrix_type::execution_space; + using team_range_policy_type = Kokkos::TeamPolicy; // Symbolic phase: // compute transpose of A for easy access to columns of A // allocate temporaries // allocate L and U size_type nnzL = 0, nnzU = 0; - range_policy_type setupPolicy(0, A.numRows()); + team_range_policy_type setupPolicy(A.numRows(), Kokkos::AUTO); KokkosSparse::Impl::MDF_count_lower compute_nnzL( A, 
handle.permutation, handle.permutation_inv); - Kokkos::parallel_reduce(range_policy_type(0, A.numRows()), compute_nnzL, - nnzL); + Kokkos::parallel_reduce(setupPolicy, compute_nnzL, nnzL); nnzU = A.nnz() - nnzL + A.numRows(); handle.allocate_data(nnzL, nnzU); From 2ad5683b470e85941999ac47aaf38aae5a4d47e7 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 17:50:24 -0600 Subject: [PATCH 075/231] remove unused typedef --- sparse/src/KokkosSparse_mdf.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 4b5c611d42..17e20d9951 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -35,8 +35,7 @@ namespace Experimental { template void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; using execution_space = typename crs_matrix_type::execution_space; using team_range_policy_type = Kokkos::TeamPolicy; From d58ac7d90fa66ecf701ec49ba52b9377dadb646b Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 10 Apr 2023 13:00:58 -0600 Subject: [PATCH 076/231] LowerBound: fully-qualified types --- common/src/KokkosKernels_LowerBound.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index 22df9545ef..160bd496f3 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -168,7 +168,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, + static_assert(KokkosKernels::Impl::is_iota_v || + Kokkos::is_view::value, "lower_bound_thread requires a " 
"KokkosKernels::Impl::Iota or a Kokkos::View"); /* @@ -448,7 +449,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, + static_assert(KokkosKernels::Impl::is_iota_v || + Kokkos::is_view::value, "lower_bound_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); From 984886d4502293409fecf7f5baa11b6d43ba805a Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 10 Apr 2023 13:01:43 -0600 Subject: [PATCH 077/231] Iota: std::remove_const -> std::remove_const_t --- common/src/KokkosKernels_Iota.hpp | 2 +- common/unit_test/Test_Common_Iota.hpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_Iota.hpp b/common/src/KokkosKernels_Iota.hpp index 5b7e24ca24..04851e81c9 100644 --- a/common/src/KokkosKernels_Iota.hpp +++ b/common/src/KokkosKernels_Iota.hpp @@ -55,7 +55,7 @@ class Iota { public: using size_type = SizeType; using value_type = T; - using non_const_value_type = std::remove_const; + using non_const_value_type = std::remove_const_t; using device_type = void; using data_type = const value_type *; diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp index cae207d56b..af3b6502bf 100644 --- a/common/unit_test/Test_Common_Iota.hpp +++ b/common/unit_test/Test_Common_Iota.hpp @@ -74,6 +74,17 @@ void test_iota_rank() { EXPECT_EQ((Iota::rank), 1); } +template +void test_iota_non_const_value_type() { + static_assert( + std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const type provided"); + static_assert( + std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const version of " + "const type provided"); +} + 
template void test_iota_subview() { // get the 7th and 8th elements of an Iota @@ -98,6 +109,7 @@ void test_iota() { test_is_iota(); test_iota_constructor(); test_iota_rank(); + test_iota_non_const_value_type(); test_iota_subview(); } From c851638702ebaddcedabac0995736aff3fec7c8c Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 10 Apr 2023 13:03:01 -0600 Subject: [PATCH 078/231] ComparableCast: safe signed/unsigned integer comparisons --- common/src/KokkosKernels_ComparableCast.hpp | 99 +++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 common/src/KokkosKernels_ComparableCast.hpp diff --git a/common/src/KokkosKernels_ComparableCast.hpp b/common/src/KokkosKernels_ComparableCast.hpp new file mode 100644 index 0000000000..a0ac4d8c06 --- /dev/null +++ b/common/src/KokkosKernels_ComparableCast.hpp @@ -0,0 +1,99 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_COMPARABLE_CAST_HPP +#define KOKKOSKERNELS_COMPARABLE_CAST_HPP + +namespace KokkosKernels { +namespace Impl { + +/*! \brief cast `a` to a type that can be safely compared with an value of type + T + + When comparing signed and unsigned types of the same size, the signed type + is converted to unsigned which produces strange behavior like int32_t(-1) > + uint32_t(1) This function casts its argument to a type that is safe to use to + compare with T and U. + + Basically this boils down to: + 1. 
forbidding any comparisons between signed integers and uint64_t, + since there's no reliable signed integer type larger than 64 bits. + 2. Using a type large enough to represent both sides of a comparison + otherwise. + + If T and A are float, use the larger type + Else If T or A are float, use the float type + Else if T xor A are signed, we can have a problem. Choose a signed type at + least: as large as the signed type large enough to represent the unsigned + type Else, choose the larger type + + This function does not protect you from casting an int to a float where that + value is not representable. +*/ +template +constexpr auto comparable_cast(const A &a) { + // both floating point, use the larger type + if constexpr (std::is_floating_point_v && std::is_floating_point_v) { + if constexpr (sizeof(T) >= sizeof(A)) { + return T(a); + } else { + return a; + } + } + // one or the other floating point, use the floating point type + else if constexpr (std::is_floating_point_v) { + return T(a); + } else if constexpr (std::is_floating_point_v) { + return a; + } else { + // exactly one is signed integer, and both are the same size, choose a large + // enough signed type + if constexpr (std::is_signed_v != std::is_signed_v) { + // how wide the signed type would need to be for T and U + constexpr size_t t_width = + std::is_signed_v ? sizeof(T) : 2 * sizeof(T); + constexpr size_t a_width = + std::is_signed_v ? 
sizeof(A) : 2 * sizeof(A); + + // how wide to compare T and U + constexpr size_t width = std::max(t_width, a_width); + if constexpr (width == 1) { + return int8_t(a); + } else if constexpr (width == 2) { + return int16_t(a); + } else if constexpr (width == 4) { + return int32_t(a); + } else if constexpr (width == 8) { + return int64_t(a); + } else { + static_assert(std::is_same_v, "no safe way to compare types"); + } + } + // both or neither are signedreturn the larger types + else { + if constexpr (sizeof(T) >= sizeof(A)) { + return T(a); + } else { + return a; + } + } + } +} + +} // namespace Impl +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_COMPARABLE_CAST_HPP \ No newline at end of file From 80adf3894ae4a797c5844cc889e1a6e77ad2e1f3 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 10 Apr 2023 13:03:15 -0600 Subject: [PATCH 079/231] add MergeMatrixDiagonal abstraction Adds an abstraction over the diagonals of the merge-matrix M of two sorted views A and B, where M[i,j] = A[i] > B[j]. A diagonal of such a matrix is the entries running from the bottom-left to the top-right (from large I / small J to small I / large J). MergeMatrixDiagonal presents a 1D-View-Like interface into a specific diagonal of the merge matrix of two views A and B, allowing the caller to retrieve the value of entries along the diagonal.
--- common/CMakeLists.txt | 1 + common/impl/KokkosKernels_SafeCompare.hpp | 81 +++ common/src/KokkosKernels_ComparableCast.hpp | 99 ---- sparse/src/KokkosSparse_MergeMatrix.hpp | 199 +++++++ sparse/unit_test/Test_Sparse.hpp | 1 + sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 592 +++++++++++++++++++ 6 files changed, 874 insertions(+), 99 deletions(-) create mode 100644 common/impl/KokkosKernels_SafeCompare.hpp delete mode 100644 common/src/KokkosKernels_ComparableCast.hpp create mode 100644 sparse/src/KokkosSparse_MergeMatrix.hpp create mode 100644 sparse/unit_test/Test_Sparse_MergeMatrix.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 88bf237274..b065869296 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,3 +1,4 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/impl) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) diff --git a/common/impl/KokkosKernels_SafeCompare.hpp b/common/impl/KokkosKernels_SafeCompare.hpp new file mode 100644 index 0000000000..494ef45ada --- /dev/null +++ b/common/impl/KokkosKernels_SafeCompare.hpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_SAFECOMPARE_HPP +#define KOKKOSKERNELS_SAFECOMPARE_HPP + +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosKernels { +namespace Impl { + +/*! 
\brief t > u + + When comparing signed and unsigned types of the same size, the signed type + is converted to unsigned which produces strange behavior like int32_t(-1) > + uint32_t(1) This function casts its arguments to types that can represent + the full range of both argument types, before comparing. + + Basically this boils down to: + 1. forbidding any comparisons between signed integers and uint64_t, + since there's no reliable signed integer type larger than 64 bits. + 2. Using a type large enough to represent both sides of a comparison + otherwise. + + If T and A are ints, and T xor U is signed, choose a signed type large + enough to represent all values of both T and U + + This function does not protect you from casting an int to a float where that + value is not representable. +*/ +template +KOKKOS_INLINE_FUNCTION constexpr bool safe_gt(const T &t, const U &u) { + using KT = Kokkos::ArithTraits; + using KU = Kokkos::ArithTraits; + + // both are integer, but only one is signed + if constexpr (KT::is_integer && KU::is_integer && + (KT::is_signed != KU::is_signed)) { + // how wide the signed type would need to be to hold T and U + constexpr size_t t_width = KT::is_signed ? sizeof(T) : 2 * sizeof(T); + constexpr size_t u_width = KU::is_signed ? 
sizeof(U) : 2 * sizeof(U); + + // compare using the max width + constexpr size_t width = KOKKOSKERNELS_MACRO_MAX(t_width, u_width); + if constexpr (width == 1) { + return int8_t(t) > int8_t(u); + } else if constexpr (width == 2) { + return int16_t(t) > int16_t(u); + } else if constexpr (width == 4) { + return int32_t(t) > int32_t(u); + } else if constexpr (width == 8) { + return int64_t(t) > int64_t(u); + } else { + static_assert(std::is_same_v, "no safe way to compare types"); + } + } else { + // use whatever the default comparison rules are + return t > u; + } + + // CUDA 11.2 issues a spurious missing return warning + return false; +} + +} // namespace Impl +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_SAFECOMPARE_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_ComparableCast.hpp b/common/src/KokkosKernels_ComparableCast.hpp deleted file mode 100644 index a0ac4d8c06..0000000000 --- a/common/src/KokkosKernels_ComparableCast.hpp +++ /dev/null @@ -1,99 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSKERNELS_COMPARABLE_CAST_HPP -#define KOKKOSKERNELS_COMPARABLE_CAST_HPP - -namespace KokkosKernels { -namespace Impl { - -/*! 
\brief cast `a` to a type that can be safely compared with an value of type - T - - When comparing signed and unsigned types of the same size, the signed type - is converted to unsigned which produces strange behavior like int32_t(-1) > - uint32_t(1) This function casts its argument to a type that is safe to use to - compare with T and U. - - Basically this boils down to: - 1. forbidding any comparisons between signed integers and uint64_t, - since there's no reliable signed integer type larger than 64 bits. - 2. Using a type large enough to represent both sides of a comparison - otherwise. - - If T and A are float, use the larger type - Else If T or A are float, use the float type - Else if T xor A are signed, we can have a problem. Choose a signed type at - least: as large as the signed type large enough to represent the unsigned - type Else, choose the larger type - - This function does not protect you from casting an int to a float where that - value is not representable. -*/ -template -constexpr auto comparable_cast(const A &a) { - // both floating point, use the larger type - if constexpr (std::is_floating_point_v && std::is_floating_point_v) { - if constexpr (sizeof(T) >= sizeof(A)) { - return T(a); - } else { - return a; - } - } - // one or the other floating point, use the floating point type - else if constexpr (std::is_floating_point_v) { - return T(a); - } else if constexpr (std::is_floating_point_v) { - return a; - } else { - // exactly one is signed integer, and both are the same size, choose a large - // enough signed type - if constexpr (std::is_signed_v != std::is_signed_v) { - // how wide the signed type would need to be for T and U - constexpr size_t t_width = - std::is_signed_v ? sizeof(T) : 2 * sizeof(T); - constexpr size_t a_width = - std::is_signed_v ? 
sizeof(A) : 2 * sizeof(A); - - // how wide to compare T and U - constexpr size_t width = std::max(t_width, a_width); - if constexpr (width == 1) { - return int8_t(a); - } else if constexpr (width == 2) { - return int16_t(a); - } else if constexpr (width == 4) { - return int32_t(a); - } else if constexpr (width == 8) { - return int64_t(a); - } else { - static_assert(std::is_same_v, "no safe way to compare types"); - } - } - // both or neither are signedreturn the larger types - else { - if constexpr (sizeof(T) >= sizeof(A)) { - return T(a); - } else { - return a; - } - } - } -} - -} // namespace Impl -} // namespace KokkosKernels - -#endif // KOKKOSKERNELS_COMPARABLE_CAST_HPP \ No newline at end of file diff --git a/sparse/src/KokkosSparse_MergeMatrix.hpp b/sparse/src/KokkosSparse_MergeMatrix.hpp new file mode 100644 index 0000000000..d573a5550f --- /dev/null +++ b/sparse/src/KokkosSparse_MergeMatrix.hpp @@ -0,0 +1,199 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_MERGEMATRIX_HPP +#define KOKKOSSPARSE_MERGEMATRIX_HPP + +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_SafeCompare.hpp" + +/// \file KokkosSparse_MergeMatrix.hpp + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +/*! 
\class MergeMatrixDiagonal + \brief a view into the entries of the Merge Matrix along a diagonal + + @tparam AView Type of the input view a, must be rank 1 + @tparam BViewLike Type of the view-like object b, must be Kokkos::View or + KokkosKernels::Iota Example merge matrix M of two arrays A (vertical) and B + (horizontal), as seen in Odeh, Green, Mwassi, Shmueli, Birk Merge Path - + Parallel Merging Made Simple 2012 M[i,j] = 1 iff A[i] > B[j] operator(k) + returns A[i] > B[j] at the kth entry of the diagonal + + 3 5 12 22 45 64 69 82 + ------------------------ + | / / + 17 | 1 1 1 0 0 0 0 0 + |/ / + 29 | 1 1 1 1 0 0 0 0 + | / + 35 | 1 1 1 1 0 0 0 0 + | / + 73 | 1 1 1 1 1 1 1 0 + | / + 86 | 1 1 1 1 1 1 1 1 + |/ + 90 | 1 1 1 1 1 1 1 1 + | + 95 | 1 1 1 1 1 1 1 1 + | + 99 | 1 1 1 1 1 1 1 1 + Diagonals are counted from the top-left. + Index into a diagonal from the bottom-left. + Shown on the figure above is the 1st and 5th diagonal + The 0th diagonal D_0 has length 0 + The 1st diagonal D_1 has length 1 + The 5th diagonal D_5 has length 5 + The 9th diagonal D_9 has length 7 + D_1(0) = 1 + D_5(0..3) = 1 + D_5(4) = 0 +*/ +template +class MergeMatrixDiagonal { + public: + static_assert(AView::rank == 1, "MergeMatrixDiagonal AView must be rank 1"); + static_assert(Kokkos::is_view_v || + KokkosKernels::Impl::is_iota_v, + "MergeMatrixDiagonal BViewLike must be Kokkos::View or " + "KokkosKernels::Iota"); + static_assert(BViewLike::rank == 1, + "MergeMatrixDiagonal BViewLike must be rank 1"); + + using execution_space = typename AView::execution_space; + + /** + * Define the types for index and value of each view + */ + using a_index_type = typename AView::size_type; + using b_index_type = typename BViewLike::size_type; + using a_value_type = typename AView::non_const_value_type; + using b_value_type = typename BViewLike::non_const_value_type; + + /*! \struct MatrixPosition + * \brief indices into the a_ and b_ views. 
+ */ + struct MatrixPosition { + a_index_type ai; + b_index_type bi; + }; + using position_type = MatrixPosition; + + // implement bare minimum parts of the view interface + enum { rank = 1 }; + using non_const_value_type = bool; ///< Merge matrix entries are 0 or 1. + + using size_type = + typename std::conditional= + sizeof(typename BViewLike::size_type), + typename AView::size_type, + typename BViewLike::size_type>:: + type; ///< The larger of the two view types' size_types + + /** \brief Initializes the view a and view-like object b and the diagonal. + */ + KOKKOS_INLINE_FUNCTION + MergeMatrixDiagonal(const AView &a, const BViewLike &b, + const size_type diagonal) + : a_(a), b_(b), d_(diagonal) {} + MergeMatrixDiagonal() = default; + + /** + * Computes the position along a and b for a given diagonal di + * + * @param di Current diagonal + * @return The MatrixPosition corresponding to the current diagonal + */ + KOKKOS_INLINE_FUNCTION + position_type position(const size_type &di) const noexcept { + position_type pos; + if (0 == d_) { + pos.ai = 0; + pos.bi = 0; + return pos; + } else { + pos = diag_to_a_b(di); + pos.ai += 1; + return pos; + } + } + + /** + * Compares a[i] > b[j] along the diagonal at entry di + * + * @param di Current diagonal + * @return True if a[i] > b[j], false otherwise + */ + KOKKOS_INLINE_FUNCTION + bool operator()(const size_type di) const { + position_type pos = diag_to_a_b(di); + if (pos.ai >= a_.size()) { + return true; // on the +a side out of matrix bounds is 1 + } else if (pos.bi >= b_.size()) { + return false; // on the +b side out of matrix bounds is 0 + } else { + return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); + } + } + + /** + * Returns the length of the diagonal + * + * @return Length of the diagonal + */ + KOKKOS_INLINE_FUNCTION + size_type size() const noexcept { + if (d_ <= a_.size() && d_ <= b_.size()) { + return d_; + } else if (d_ > a_.size() && d_ > b_.size()) { + // TODO: this returns nonsense if d_ 
happens to be outside the merge + // matrix + return a_.size() + b_.size() - d_; + } else { + return KOKKOSKERNELS_MACRO_MIN(a_.size(), b_.size()); + } + } + + private: + /** + * Translates an index along the diagonal to indices into a_ and b_ + * + * @param di Current diagonal + * @return The corresponding MatrixPosition with indices into a_ and b_ + */ + KOKKOS_INLINE_FUNCTION + position_type diag_to_a_b(const size_type &di) const noexcept { + position_type res; + res.ai = d_ < a_.size() ? (d_ - 1) - di : a_.size() - 1 - di; + res.bi = d_ < a_.size() ? di : d_ + di - a_.size(); + return res; + } + + AView a_; ///< The a view + BViewLike b_; ///< The b view + size_type d_; ///< diagonal +}; + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_MERGEMATRIX_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 0dbf7bc759..e10bac740d 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -27,6 +27,7 @@ #include "Test_Sparse_mdf.hpp" #include "Test_Sparse_findRelOffset.hpp" #include "Test_Sparse_gauss_seidel.hpp" +#include "Test_Sparse_MergeMatrix.hpp" #include "Test_Sparse_replaceSumInto.hpp" #include "Test_Sparse_replaceSumIntoLonger.hpp" #include "Test_Sparse_spadd.hpp" diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp new file mode 100644 index 0000000000..0cddef964f --- /dev/null +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -0,0 +1,592 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_COMMON_MERGE_MATRIX +#define TEST_COMMON_MERGE_MATRIX + +#include +#include + +#include +#include "KokkosKernels_Iota.hpp" +#include "KokkosSparse_MergeMatrix.hpp" + +namespace Test_Sparse_MergeMatrix { + +template +View from_std_vec(const std::string &label, + const std::vector &vec) { + Kokkos::View + uvec(vec.data(), vec.size()); + View result(label, uvec.size()); + Kokkos::deep_copy(result, uvec); + return result; +} + +template +struct CopyMmdToView { + CopyMmdToView(const View &dst, const MMD &src) : dst_(dst), src_(src) {} + + KOKKOS_INLINE_FUNCTION + void operator()(size_t i) const { dst_(i) = src_(i); } + + private: + View dst_; + MMD src_; +}; + +template +void expect_mmd_entries( + const MMD &mmd, + const std::vector &expected) { + using execution_space = typename MMD::execution_space; + using Policy = Kokkos::RangePolicy; + using View = + Kokkos::View; + + // size is as expected + EXPECT_EQ(mmd.size(), expected.size()); + + // values are as expected + View view("mmd-values", mmd.size()); + execution_space space; + Kokkos::parallel_for(Policy(space, 0, mmd.size()), CopyMmdToView(view, mmd)); + auto host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); + space.fence(); + for (size_t i = 0; i < host.size(); ++i) { + EXPECT_EQ(host(i), expected[i]); + } +} + +/*! \brief merge-matrix of two empty views + + Matrix is 0x0. + Only diagonal 0 exists, and it should be size 0. +*/ +template +void view_view_empty_empty() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + AView a("view-view-empty-empty-a", 0); + BView b("view-view-empty-empty-b", 0); + expect_mmd_entries(MMD(a, b, 0), {}); +} + +/*! \brief merge-matrix of one empty view + + Matrix is Nx0. 
+ N diagonals exist, all length 0 +*/ +template +void view_view_full_empty() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + size_t aNonzero = 5; + AView a("view-view-full-empty-a", aNonzero); + BView b("view-view-full-empty-b", 0); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + expect_mmd_entries(MMD(a, b, diagonal), {}); + } +} + +/*! \brief merge-matrix of one empty view + + Matrix is 0xN. + N diagonals exist, all length 0 +*/ +template +void view_view_empty_full() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + AView a("view-view-empty-full-a", 0); + BView b = from_std_vec("view-view-empty-full-b", {0, 1, 2, 3}); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + expect_mmd_entries(MMD(a, b, diagonal), {}); + } +} + +template +std::tuple view_view_case_all_zero() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + AView a = from_std_vec("view-view-case-all-zero-a", {0, 0, 0, 0}); + BView b = from_std_vec("view-view-case-all-zero-b", {0, 1, 2, 3}); + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_all_one() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 0 0 0 + // A ------- + // 1 | 1 1 1 1 + // 2 | 1 1 1 1 + // 3 | 1 1 1 1 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-view-case-all-one-a", {1, 2, 3, 4}); + BView b = from_std_vec("view-view-case-all-one-b", {0, 0, 0, 0}); + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_1() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 1 | 1 0 0 0 + // 2 | 1 1 0 0 + // 3 | 1 1 1 0 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-view-case-1-a", {1, 2, 3, 4}); + BView b = from_std_vec("view-view-case-1-b", {0, 1, 2, 3}); + + // diagonal 
0: {} + // 1: {1} + // 2: {1,0} + // 3: {1,1,0} + // 4: {1,1,0,0} + // 5: {1,1,0} + // 6: {1,0} + // 7: {1} + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_2() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 2 2 8 8 8 + // A ----------- + // 1 | 1 0 0 0 0 0 + // 2 | 1 0 0 0 0 0 + // 9 | 1 1 1 1 1 1 + AView a = from_std_vec("view-view-case-2-a", {1, 2, 9}); + BView b = from_std_vec("view-view-case-2-b", {0, 2, 2, 8, 8, 8}); + // 0: {} + // 1: {1} + // 2: {1,0} + // 3: {1,0,0} + // 4: {1,0,0} + // 5: {1,0,0} + // 6: {1,0,0} + // 7: {1,0} + // 8: {1} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_3() { + using AEntry = typename AView::non_const_value_type; + // M[i,j] = 1 iff A[i] > B[j] + // B 0 2 7 + // A ----- + // -1 | 0 0 0 + // 9 | 1 1 1 + // 9 | 1 1 1 + AView a = from_std_vec("view-view-case-3-a", + {AEntry(-1), AEntry(9), AEntry(9)}); + BView b = from_std_vec("view-view-case-3-b", {0, 2, 7}); + // 0: {} + // 1: {0} + // 2: {1,0} + // 3: {1,1,0} + // 4: {1,1} + // 5: {1} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_4() { + using BEntry = typename BView::non_const_value_type; + + // M[i,j] = 1 iff A[i] > B[j] + // B -3 -1 7 + // A ------- + // 1 | 1 1 0 + // 6 | 1 1 0 + // 6 | 1 1 0 + AView a = from_std_vec("view-view-case-4-a", {1, 6, 6}); + BView b = + from_std_vec("view-view-case-4-b", {BEntry(-3), BEntry(-1), 7}); + // 0: {} + // 1: {1} + // 2: {1,1} + // 3: {1,1,0} + // 4: {1,0} + // 5: {0} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_5() { + using AEntry = typename AView::non_const_value_type; + using BEntry = typename BView::non_const_value_type; + + // M[i,j] = 1 iff A[i] > B[j] + // B -2 0 1 + // A ------- + // -3 | 0 0 0 + // -2 | 0 0 0 + // 2 | 1 1 1 + AView a = from_std_vec("view-view-case-5-a", + {AEntry{-3}, AEntry{-2}, AEntry{2}}); + BView b = from_std_vec("view-view-case-5-b", + {BEntry{-2}, BEntry{0}, BEntry{1}}); + // 0: {} + // 1: {0} + 
// 2: {0,0} + // 3: {0,0,0} + // 4: {1,0} + // 5: {1} + return std::make_tuple(a, b); +} + +/*! \brief merge-matrix of two views + + Matrix is MxN. + M+N-1 diagonals exist. +*/ +template +void view_view_full_full() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using mmd_value_type = typename MMD::non_const_value_type; + + { + auto [a, b] = view_view_case_all_zero(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 0 + expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(0))); + } + } + { + auto [a, b] = view_view_case_all_one(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 0 + expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(1))); + } + } + { + auto [a, b] = view_view_case_1(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0}); + expect_mmd_entries(MMD(a, b, 7), {1}); + } + { + auto [a, b] = view_view_case_2(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 7), {1, 0}); + expect_mmd_entries(MMD(a, b, 8), {1}); + } + if constexpr (std::is_signed_v) { + auto [a, b] = view_view_case_3(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {0}); + expect_mmd_entries(MMD(a, b, 2), {1, 
0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1}); + expect_mmd_entries(MMD(a, b, 5), {1}); + } + if constexpr (std::is_signed_v) { + auto [a, b] = view_view_case_4(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 1}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0}); + expect_mmd_entries(MMD(a, b, 5), {0}); + } + if constexpr (std::is_signed_v && std::is_signed_v) { + auto [a, b] = view_view_case_5(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {0}); + expect_mmd_entries(MMD(a, b, 2), {0, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0}); + expect_mmd_entries(MMD(a, b, 5), {1}); + } +} + +template +void test_view_view() { + view_view_empty_empty(); + view_view_full_empty(); + view_view_empty_full(); + view_view_full_full(); +} + +/*! \brief merge-matrix of an empty view and empty iota + + Matrix is 0x0. + Only diagonal 0 exists, and it should be size 0. +*/ +template +void view_iota_empty_empty() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + AView a("view-iota-empty-empty-a", 0); + BView b(0); + EXPECT_EQ(MMD(a, b, 0).size(), 0); +} + +/*! \brief merge-matrix of a full view and empty iota + + Matrix is Nx0. + N diagonals exist, all length 0 +*/ +template +void view_iota_full_empty() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + size_t aNonzero = 5; + AView a("view-iota-full-empty-a", aNonzero); + BView b(0); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + EXPECT_EQ(MMD(a, b, diagonal).size(), 0); + } +} + +/*! \brief merge-matrix of and empty view and a full iota + + Matrix is 0xN. 
+ N diagonals exist, all length 0 +*/ +template +void view_iota_empty_full() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + + AView a("view-iota-empty-full-a", 0); + BView b(4); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + EXPECT_EQ(MMD(a, b, diagonal).size(), 0); + } +} + +template +std::tuple view_iota_case_all_zero() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + AView a = from_std_vec("view-iota-case-all-zero-a", {0, 0, 0, 0}); + BView b(4); + + return std::make_tuple(a, b); +} + +template +std::tuple view_iota_case_all_one() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 5 | 1 1 1 1 + // 6 | 1 1 1 1 + // 7 | 1 1 1 1 + // 8 | 1 1 1 1 + AView a = from_std_vec("view-iota-case-all-one-a", {5, 6, 7, 8}); + BView b(4); + + return std::make_tuple(a, b); +} + +template +std::tuple view_iota_case_1() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 1 | 1 0 0 0 + // 2 | 1 1 0 0 + // 3 | 1 1 1 0 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-iota-case-1-a", {1, 2, 3, 4}); + BView b(4); + + // diagonal 0: {} + // 1: {1} + // 2: {1,0} + // 3: {1,1,0} + // 4: {1,1,0,0} + // 5: {1,1,0} + // 6: {1,0} + // 7: {1} + + return std::make_tuple(a, b); +} + +/*! \brief merge-matrix of a full view with a full iota + + Matrix is MxN. + M+N-1 diagonals exist. 
+*/ +template +void view_iota_full_full() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using mmd_value_type = typename MMD::non_const_value_type; + + { + auto [a, b] = view_iota_case_all_zero(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 0 + expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(0))); + } + } + { + auto [a, b] = view_iota_case_all_one(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 1 + expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(1))); + } + } + { + auto [a, b] = view_iota_case_1(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0}); + expect_mmd_entries(MMD(a, b, 7), {1}); + } +} + +template +void test_view_iota() { + view_iota_empty_empty(); + view_iota_full_empty(); + view_iota_empty_full(); + view_iota_full_full(); +} + +template +void test_rank() { + { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + static_assert(MMD::rank == 1, + "MergeMatrixDiagonal should look like a rank-1 view"); + } + + { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = + KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + static_assert(MMD::rank == 1, + "MergeMatrixDiagonal should look like a rank-1 view"); + } +} + +template +void test_merge_matrix() { + test_rank(); + test_view_view(); + test_view_iota(); +} + +} // namespace Test_Sparse_MergeMatrix + 
+TEST_F(TestCategory, common_merge_matrix) { + // clang-format off + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + // test some select integer / float combos + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + // no generally safe way to compare all possible values of these types + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + + // clang-format on +} + +#endif // TEST_COMMON_MERGE_MATRIX From 2391ae09cb84ca87e468455c95145f03b4ef9544 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 10 Apr 2023 14:34:29 -0600 Subject: [PATCH 080/231] Blas: Don't document return values of functions that don't return --- blas/src/KokkosBlas1_mult.hpp | 2 -- blas/src/KokkosBlas1_swap.hpp | 9 +++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 47fa1f536f..f390b3556a 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -40,8 +40,6 @@ namespace KokkosBlas { /// \param alpha [in] The scalar to apply to A. /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. 
-/// -/// \return Y = gamma * Y + alpha * A * X. template void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index f91d090cd5..26c529f3b7 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -33,10 +33,9 @@ namespace KokkosBlas { /// \param x [in/out] 1-D View. /// \param y [in/out] 1-D View. /// -/// \return x and y with swapped values, note that this is akin to -/// performing a deep_copy, swapping pointers inside view -/// can only be performed if no aliasing, subviews, etc... -/// exist, which cannot be asserted by this function. +/// Swaps x and y. Note that this is akin to performing a deep_copy, swapping +/// pointers inside view can only be performed if no aliasing, subviews, etc... +/// exist, which cannot be asserted by this function. /// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking @@ -107,8 +106,6 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// \param x [in/out] 1-D View. /// \param y [in/out] 1-D View. /// -/// \return x and y with swapped values. -/// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking. Note that the kernel will be /// executed on the default stream of the execution_space associted with x. 
From d928f72c9e87e00f585f04132704a570e2eb23de Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 13 Jul 2023 11:22:55 -0600 Subject: [PATCH 081/231] ParILUT bench: fix unused IS_GPU warning --- perf_test/sparse/KokkosSparse_par_ilut.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index ef144f2817..52191a47f9 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -71,9 +71,6 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; -static constexpr bool IS_GPU = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - /////////////////////////////////////////////////////////////////////////////// void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, const sp_matrix_type& A, int& num_iters) @@ -132,6 +129,9 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, #ifdef USE_GINKGO /////////////////////////////////////////////////////////////////////////////// +static constexpr bool IS_GPU = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + using ginkgo_exec = std::conditional_t; From 1ad8757d82a1387ae0a1ff684f1ee649d691263c Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 27 Jun 2023 14:04:16 -0700 Subject: [PATCH 082/231] Add BsrMatrix SpMV perf test Also adds KokkosSparse::Impl::crs_detect_block_size(), which finds the largest block size in a CrsMatrix that yields only dense blocks. 
--- perf_test/sparse/CMakeLists.txt | 4 + .../KokkosSparse_spmv_bsr_benchmark.cpp | 460 ++++++++++++++++++ .../KokkosSparse_crs_detect_block_size.hpp | 158 ++++++ 3 files changed, 622 insertions(+) create mode 100644 perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp create mode 100644 sparse/impl/KokkosSparse_crs_detect_block_size.hpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 263f59671a..2039276c79 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -135,4 +135,8 @@ if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( sparse_spmv_benchmark SOURCES KokkosSparse_spmv_benchmark.cpp ) + + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_spmv_bsr_benchmark SOURCES KokkosSparse_spmv_bsr_benchmark.cpp + ) endif() diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp new file mode 100644 index 0000000000..933917c1a6 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -0,0 +1,460 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! 
\file KokkosSparse_spmv_bsr_benchmark.cpp + + Read a matrix market file, choose a block size and a number of multivectors, + and compare Bsr SpMV implementations +*/ + +#include +#include + +#include + +/* Some versions of clang that hipcc is basedoff of haven't stabilized + * std::filesystem yet */ +#if defined(KOKKOS_ENABLE_HIP) && __HIPCC__ +#include +namespace fs = std::experimental::filesystem; +#else +#include +namespace fs = std::filesystem; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#include +#endif + +#include + +#include + +#include + +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_crs_detect_block_size.hpp" + +using namespace KokkosKernelsBenchmark; + +/* Since benchmarks have to be defined before they are executed, the file IO + for each benchmark needs to be in the execution itself, otherwise every + matrix would have to be resident in memory before any benchmark can run. + + If multiple benchmarks need the same file, it would be read over and over + again. This is especially painful on network file systems, so this executable + has a global cache to store the most recently-read matrix. + + Despite that the matrix is always read with the same precision, we don't + know the Device at this time, so we can't define the value type of the cache + yet. Instead, we'll erase the type, and use a pointer to void. 
The cache will + be keyed on a combination of the path and the requested type, so we know if + the actual CrsMatrix behind the void pointer matches the requested type or + not +*/ +using Key = std::tuple; +using Val = std::shared_ptr; // type-erased Crs matrix (since we don't + // know the template params) +static Key CACHE_KEY = {"", std::type_index(typeid(void))}; +static Val CACHE_VAL = nullptr; + +// This can be called before Kokkos::finalize to kill the matrix that is living +// in the cache +void drop_cache() { + CACHE_KEY = {"", std::type_index(typeid(void))}; + CACHE_VAL = nullptr; +} + +/// cache repeated reads to \c path +template +Crs cached_read(const fs::path &path) { + // check if the cached matrix is a Crs from path + const Key key(path, std::type_index(typeid(Crs))); + + // if this is not the cached matrix, overwrite the cache + if (CACHE_KEY != key) { + CACHE_KEY = key; + CACHE_VAL = std::make_shared( + KokkosSparse::Impl::read_kokkos_crst_matrix(path.c_str())); + } + + // the Crs type is part of the key, so we know this cast is safe + return *std::static_pointer_cast(CACHE_VAL); +} + +/* Cache a map of path -> matrix block size so that scanning the matrix to + * register the benchmark and then actually running the becnchmark don't both + * need to run the matrix */ +template +size_t detect_block_size(const fs::path &path) { + using ReadScalar = double; + using ReadOrdinal = int64_t; + using ReadOffset = uint64_t; + using Crs = KokkosSparse::CrsMatrix; + + static std::map cache; + + if (0 == cache.count(path)) { + std::cerr << "read " << path << "...\n"; + const Crs crs = cached_read(path); + size_t detectedSize = KokkosSparse::Impl::detect_block_size(crs); + std::cerr << "detected block size = " << detectedSize << "\n"; + cache[path] = detectedSize; + } + return cache.at(path); +} + +// a bool by a different name, to make its purpose clear +class DieOnError { + public: + DieOnError(const bool &val) : val_(val) {} + + operator bool() const { return 
val_; } + + private: + bool val_; +}; + +// a bool by a different name, to make its purpose clear +class SkipOnError { + public: + SkipOnError(const bool &val) : val_(val) {} + operator bool() const { return val_; } + + private: + bool val_; +}; + +// Test that y_act is close to y_exp. +// This needs the matrix, alpha, and beta to compute the error tolerance +// properly +template +void check_correctness(benchmark::State &state, const View &y_exp, + const View &y_act, const Matrix &crs, const Alpha &alpha, + const Beta &beta, const DieOnError &die, + const SkipOnError &skip) { + using execution_space = typename View::execution_space; + using scalar_type = typename View::non_const_value_type; + using AT = Kokkos::ArithTraits; + using mag_type = typename AT::mag_type; + using ATM = Kokkos::ArithTraits; + + // max value in A + mag_type maxA = 0; + Kokkos::parallel_reduce( + "maxA", Kokkos::RangePolicy(0, crs.nnz()), + KOKKOS_LAMBDA(const int &i, mag_type &lmax) { + mag_type v = AT::abs(crs.values(i)); + lmax = lmax > v ? 
lmax : v; + }, + maxA); + + double eps = AT::epsilon(); + const double max_val = + AT::abs(beta * 1.0 + crs.numCols() * alpha * maxA * 1.0); + + auto h_exp = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y_exp); + auto h_act = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y_act); + + size_t err = 0; + std::vector> errIdx; + for (size_t i = 0; i < h_exp.extent(0); ++i) { + for (size_t k = 0; k < h_exp.extent(1); ++k) { + const mag_type error = ATM::abs(h_exp(i, k) - h_act(i, k)); + if (error > eps * max_val) { + ++err; + errIdx.push_back({i, k}); + } + } + } + if (err > 0) { + size_t errLimit = 100; // how many errors to print + std::cerr << "first " << errLimit << " errors...\n"; + std::cerr << "i\tk\texp\tact" << std::endl; + std::cerr << "-\t-\t---\t---" << std::endl; + for (auto [i, k] : errIdx) { + std::cerr << i << "\t" << k << "\t" << h_exp(i, k) << "\t" << h_act(i, k) + << std::endl; + if (0 == --errLimit) { + break; + } + } + std::cerr << __FILE__ << ":" << __LINE__ << ": ERROR: correctness failed " + << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << ": threshold was " + << eps * max_val << std::endl; + + if (die) { + exit(EXIT_FAILURE); + } else if (skip) { + state.SkipWithError("correctness check failed"); + } + } +} + +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvDefault { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + return KokkosSparse::spmv(mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "default"; } +}; + +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvNative { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "native"); + return KokkosSparse::spmv(controls, 
mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "native"; } +}; + +template +void run(benchmark::State &state, const Bsr &bsr, const size_t k) { + using execution_space = typename Bsr::execution_space; + using memory_space = typename Bsr::memory_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using size_type = typename Bsr::non_const_size_type; + + // multivector should be layoutleft for CPU, makes + // slices of a single vector contiguous + using view_t = Kokkos::View; + + state.counters["nnz"] = bsr.nnz() * bsr.blockDim() * bsr.blockDim(); + state.counters["num_rows"] = bsr.numRows() * bsr.blockDim(); + state.counters["block_size"] = bsr.blockDim(); + state.counters["num_vecs"] = k; + + view_t y_init("y_init", bsr.numRows() * bsr.blockDim(), k); + view_t y_exp("ye", bsr.numRows() * bsr.blockDim(), k); + view_t y_act("ya", bsr.numRows() * bsr.blockDim(), k); + view_t x("x", bsr.numCols() * bsr.blockDim(), k); + + Kokkos::Random_XorShift64_Pool random_pool(12345); + fill_random(y_init, random_pool, 0.0, 1.0); + fill_random(x, random_pool, 0.0, 1.0); + scalar_type alpha{1.17}; + scalar_type beta{-0.3}; + + Kokkos::deep_copy(y_act, y_init); + Kokkos::deep_copy(y_exp, y_init); + + const char *mode = KokkosSparse::NoTranspose; + + // test the SpMV against whatever the default is + KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); + Spmv::spmv(mode, alpha, bsr, x, beta, y_act); + Kokkos::fence(); + + check_correctness(state, y_exp, y_act, bsr, alpha, beta, DieOnError(false), + SkipOnError(true)); + + Kokkos::fence(); + for (auto _ : state) { + Spmv::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); + } + + const size_t bytesPerSpmv = + bsr.nnz() * bsr.blockDim() * bsr.blockDim() * + sizeof(scalar_type) // A values + + bsr.nnz() * sizeof(ordinal_type) // A col indices + + (bsr.numRows() + 1) * sizeof(size_type) // A row-map + + 2 * 
bsr.numRows() * bsr.blockDim() * k * + sizeof(scalar_type) // load / store y + + bsr.numCols() * bsr.blockDim() * k * sizeof(scalar_type) // load x + ; + + state.SetBytesProcessed(bytesPerSpmv * state.iterations()); +} + +template +void read_expand_run(benchmark::State &state, const fs::path &path, + const size_t blockSize, const size_t k) { + using device_type = typename Bsr::device_type; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + + // read Crs into host memory + using Crs = + KokkosSparse::CrsMatrix; + + const Crs crs = cached_read(path); + Bsr bsr; + try { + bsr = KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); + } catch (std::exception &e) { + state.SkipWithError(e.what()); + return; + } + + run(state, bsr, k); +} + +template +void read_convert_run(benchmark::State &state, const fs::path &path, + const size_t blockSize, const size_t k) { + using device_type = typename Bsr::device_type; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + + using Crs = + KokkosSparse::CrsMatrix; + + const Crs crs = cached_read(path); + Bsr bsr; + try { + bsr = KokkosSparse::Impl::blocked_crs_to_bsr(crs, blockSize); + } catch (std::exception &e) { + state.SkipWithError(e.what()); + return; + } + + run(state, bsr, k); +} + +template +void register_expand_type(const fs::path &path) { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + std::vector ks = {1, 3}; + for (size_t bs : {4, 7, 10, 16}) { // block sizes + for (size_t k : ks) { // multivector sizes + std::string name = + std::string("MatrixMarketExpanded") + "/" + std::string(path.stem()) + + "/" + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + std::to_string(bs) + "/" + + std::to_string(k) + "/" + Spmv::name() + "/" + Device::name(); + benchmark::RegisterBenchmark(name.c_str(), read_expand_run, + path, bs, 
k) + ->UseRealTime(); + } + } +} + +template +void register_convert_type(const fs::path &path, size_t bs) { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + std::vector ks = {1, 3}; + + for (size_t k : ks) { // multivector sizes + std::string name = + std::string("MatrixMarketConvert") + "/" + std::string(path.stem()) + + "/" + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + std::to_string(bs) + "/" + + std::to_string(k) + "/" + Spmv::name() + "/" + Device::name(); + benchmark::RegisterBenchmark(name.c_str(), read_convert_run, + path, bs, k) + ->UseRealTime(); + } +} + +template +void register_converts(const fs::path &path, const size_t bs) { + std::cerr << "benchmarks will use detected blocksize\n"; + // clang-format off + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + // clang-format on +} + +template +void register_expands(const fs::path &path) { + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); +} + +template +void register_path(const fs::path &path) { + size_t detectedSize; + try { + detectedSize = detect_block_size(path); + } catch (const std::exception &e) { + std::cerr << "ERROR while reading: " << e.what() << "\n" + << "skipping!\n"; + return; + } + + /* If a block size can be detected, just use that block size without + expanding the matrix. 
+ Otherwise, expand the matrix to some arbitrary block sizes to test BSR + */ + if (detectedSize != 1) { + std::cerr << "benchmarks will use detected size\n"; + register_converts(path, detectedSize); + } else { + std::cerr << "benchmarks will expand each non-zero into a larger block\n"; + register_expands(path); + } +} + +int main(int argc, char **argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMicrosecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + for (int i = 1; i < argc; ++i) { +#if defined(KOKKOS_ENABLE_CUDA) + register_path(argv[i]); +#endif +#if defined(KOKKOS_ENABLE_HIP) + register_path(argv[i]); +#endif +#if defined(KOKKOS_ENABLE_SERIAL) + register_path(argv[i]); +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + drop_cache(); + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp new file mode 100644 index 0000000000..42d4eddf89 --- /dev/null +++ b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp @@ -0,0 +1,158 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP +#define KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP + +#include + +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_Utils.hpp" + +/*! 
\file KokkosSparse_crs_detect_block_size.hpp + + \brief A utility function for detecting the block size in a CrsMatrix. Not + for performance-sensitive use. +*/ + +namespace KokkosSparse { +namespace Impl { + +/** + * \class BlockPopulations + * \brief A class to store population counts of blocks in a CrsMatrix + */ +class BlockPopulations { + public: + /** + * \brief Constructor for BlockPopulations + * \param sz The block size + */ + BlockPopulations(size_t sz) : sz_(sz) {} + + /** + * \brief Add a point to the corresponding block + * \param r The row index of the point + * \param c The column index of the point + */ + void add(size_t r, size_t c) { + auto key = std::make_pair(r / sz_, c / sz_); + auto it = blocks_.find(key); + if (it == blocks_.end()) { + blocks_.insert(std::make_pair(key, 1)); + } else { + ++(it->second); + } + } + + /** + * \brief Check if all blocks are dense + * \return True if all blocks have a count equal to the block size squared + */ + bool all_dense() const { + for (const auto &kv : blocks_) { + if (kv.second < sz_ * sz_) { + return false; + } + } + return true; + } + + private: + std::map, size_t> + blocks_; /**< A map of block coordinates to their population counts */ + size_t sz_; /**< The block size */ +}; + +/** + * @brief Detects the largest block size that yields only dense blocks in a + CrsMatrix + * + * @tparam Crs The type of the CRS matrix. + * @param crs The CRS matrix to detect the block size for. + * @return The largest block size that results in completely dense blocks + The smallest valid block size is 1 + Since blocks must be dense, sqrt(nnz), num rows, num cols, and min nnz/row + among non-empty rows are all easy upper bounds of the block size Block sizes + are tested from 1 to the minimum of the above The matrix dimensions must divide + evenly into a trial block size (otherwise a block would not be full) + Furthermore, if a block size of N is not dense, any multiple of N will also not + be dense, and can be skipped. 
This is because blocks of 2N contain blocks of N, + at least one of which is already known not to be dense. In practice, this ends + up testing only small composite factors and all prime factors up to the upper + bound +*/ +template +size_t detect_block_size(const Crs &crs) { + using ordinal_type = typename Crs::ordinal_type; + + // copy matrix data to host + auto rs = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.row_map); + auto cs = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.entries); + + // upper bound is minimum of sqrt(nnz), numRows, numCols, + // and smallest non-empty row + size_t upperBound = std::sqrt(double(crs.nnz())); + upperBound = std::min(upperBound, size_t(crs.numRows())); + upperBound = std::min(upperBound, size_t(crs.numCols())); + for (size_t i = 1; i < rs.size(); ++i) { + size_t rowLen = rs(i) - rs(i - 1); + if (rowLen > 0) { + upperBound = std::min(upperBound, rowLen); + } + } + + // trial blocks sizes that didn't work out + std::vector rejectedSizes; + + size_t largestBlockSize = 1; // always a valid block size + for (size_t trialSize = 2; trialSize <= upperBound; ++trialSize) { + // trial size must be factor of rows / cols + if ((crs.numRows() % trialSize) || (crs.numCols() % trialSize)) { + continue; + } + + // trial size must not be a multiple of previously-rejected size + if (std::any_of(rejectedSizes.begin(), rejectedSizes.end(), + [&](size_t f) { return trialSize % f == 0; })) { + continue; + } + + // count the population of all blocks + BlockPopulations pops(trialSize); + for (ordinal_type row = 0; row < crs.numRows(); ++row) { + for (size_t ci = rs(row); ci < rs(row + 1); ++ci) { + ordinal_type col = cs(ci); + pops.add(row, col); + } + } + + // if all blocks are dense, this is the largest one so far + if (pops.all_dense()) { + largestBlockSize = trialSize; + } else { + rejectedSizes.push_back(trialSize); + } + } + return largestBlockSize; +} + +} // namespace Impl +} // namespace 
KokkosSparse + +#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP \ No newline at end of file From cde06624b45ed9eeb59868b0bd140182d2be1146 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 10 Jul 2023 07:29:49 -0600 Subject: [PATCH 083/231] ODE: adding adaptivity test for RK methods A few variables were not updated in a logical way leading to very bad and illogical performance of the adaptive algorithms. This should now be fixed. Future work can work into reporting the number of time steps actually taken and also some discussion around the inner sub-stepping should happen. A new unit-test is added to test adaptivity although a criteria to measure the efficacy of the adaptive algorithm needs to be added. --- ode/impl/KokkosODE_RungeKutta_impl.hpp | 52 +++++++----- ode/unit_test/Test_ODE_RK.hpp | 106 +++++++++++++++++++++++-- 2 files changed, 128 insertions(+), 30 deletions(-) diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index 791093c8db..8bc9662c5a 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -97,20 +97,38 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { constexpr scalar_type error_threshold = 1; bool adapt = params.adaptivity; + bool dt_was_reduced; if (std::is_same_v>) { adapt = false; } - scalar_type dt = (t_end - t_start) / params.max_steps; - scalar_type t = t_start; - for (int stepIdx = 0; (stepIdx < params.max_steps) && (t < t_end); + // Set current time and initial time step + scalar_type t_now = t_start; + scalar_type dt = (t_end - t_start) / params.max_steps; + + // Loop over time steps to integrate ODE + for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { - // Set err to be arbitrarily larger than our threshold of 1 + // Check that the step attempted is not putting + // the solution past t_end, otherwise shrink dt + if (t_end < t_now + 
dt) { + dt = t_end - t_now; + } + + // Set error to be arbitrarily larger than our threshold + // so we can pass the initial check. Also reset + // dt_was_reduced to false for current time step. scalar_type error = 2 * error_threshold; scalar_type tol = 0; + dt_was_reduced = false; + + // Take tentative steps until the requested error + // is met. This of course only works for adaptive + // solvers, for fix time steps we simply do not + // compute and check what error of the current step while (error_threshold < error) { // Take a step of Runge-Kutta integrator - RKStep(ode, table, adapt, t, dt, y0, y, temp, k_vecs); + RKStep(ode, table, adapt, t_now, dt, y0, y, temp, k_vecs); // Compute the largest error and decide on // the size of the next time step to take. @@ -131,44 +149,34 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( // is rejected. if (error > 1) { dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt_was_reduced = true; } + if (dt < params.min_step_size) return Experimental::ode_solver_status::MIN_SIZE; } } - // Update y0 to stage the next time step. + // Update time and initial condition for next time step + t_now += dt; for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y0(eqIdx) = y(eqIdx); } - if (t < t_end) { - // We may want to print the evolution of the solution over time - // with something similar to the statement below but will need - // to generalize it and make it GPU friendly first, also it - // should be guarded when not doing a debug run, this prints - // a lot... 
- // std::cout << " step " << stepIdx << " t=" << t << ", y={"; - // for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - // std::cout << y(eqIdx) << " "; - // } - // std::cout << "}" << std::endl; - if (adapt) { + if (t_now < t_end) { + if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment dt = dt * Kokkos::min( 10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); - } else { - // Use same increment - t += dt; } } else { return Experimental::ode_solver_status::SUCCESS; } } - if (t < t_end) return Experimental::ode_solver_status::MAX_STEP; + if (t_now < t_end) return Experimental::ode_solver_status::MAX_STEP; return Experimental::ode_solver_status::SUCCESS; } // RKSolve diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 1e851108f3..1d971cc8d3 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -463,19 +463,109 @@ void test_convergence_rate() { } } // test_convergence_rate +template +void test_adaptivity() { + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + + duho my_oscillator(1, 1, 4); + const int neqs = my_oscillator.neqs; + + vec_type y("solution", neqs), f("function", neqs); + auto y_h = Kokkos::create_mirror(y); + y_h(0) = 1; + y_h(1) = 0; + Kokkos::deep_copy(y, y_h); + + constexpr double tstart = 0, tend = 1.024; + constexpr int maxSteps = 512, numSteps = 128; + constexpr double absTol = 1e-14, relTol = 1e-8, minStepSize = 0.001; + vec_type y_new("y new", neqs), y_old("y old", neqs); + + // Since y_old_h will be reused to set initial conditions + // for each method tested we do not want to use + // create_mirror_view which would not do a copy + // when y_old is in HostSpace. + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + + // First compute analytical solution as reference + // and to evaluate the error from each RK method. 
+ vec_type y_ref("reference value", neqs); + auto y_ref_h = Kokkos::create_mirror(y_ref); + { + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::RangePolicy my_policy(0, 1); + solution_wrapper wrapper(my_oscillator, tend, y_old, y_ref); + Kokkos::parallel_for(my_policy, wrapper); + + Kokkos::deep_copy(y_ref_h, y_ref); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nAnalytical solution" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; +#endif + } + + vec_type tmp("tmp vector", neqs); + mv_type kstack( + "k stack", neqs, + KokkosODE::Experimental::RungeKutta::num_stages()); + + Kokkos::RangePolicy my_policy(0, 1); + KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, + minStepSize); + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + RKSolve_wrapper + solve_wrapper(my_oscillator, params, tstart, tend, y_old, y_new, tmp, + kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "Results: " << std::endl; + std::cout << " y_ref={ "; + for (int idx = 0; idx < y_ref_h.extent_int(0); ++idx) { + std::cout << y_ref_h(idx) << " "; + } + std::cout << "}" << std::endl; + std::cout << " y_new={ "; + for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { + std::cout << y_new_h(idx) << " "; + } + std::cout << "}" << std::endl; + std::cout << " error={ "; + double error; +#endif + + for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + error = + Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); + std::cout << error << " "; +#endif + EXPECT_NEAR_KK_REL(y_new_h(idx), y_ref_h(idx), 1e-7); + } +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "}" << std::endl; +#endif + +} // test_adaptivity + } // namespace Test -int test_RK() { - Test::test_RK(); - return 1; -} +void test_RK() 
{ Test::test_RK(); } + +void test_RK_conv_rate() { Test::test_convergence_rate(); } -int test_RK_conv_rate() { - Test::test_convergence_rate(); - return 1; -} +void test_RK_adaptivity() { Test::test_adaptivity(); } #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, RKSolve_serial) { test_RK(); } TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); } +TEST_F(TestCategory, RK_adaptivity) { test_RK_adaptivity(); } #endif From 394f1ab6e8e2c4b1f36c62296e89a799d16156de Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 14 Jul 2023 14:40:40 -0600 Subject: [PATCH 084/231] Don't write to source tree during build --- perf_test/batched/sparse/CMakeLists.txt | 15 ++++++++++++--- perf_test/batched/sparse/scripts/run_CG.sh | 3 --- perf_test/batched/sparse/scripts/run_CG.sh.in | 1 + perf_test/batched/sparse/scripts/run_GMRES.sh | 3 --- perf_test/batched/sparse/scripts/run_GMRES.sh.in | 1 + perf_test/batched/sparse/scripts/run_SPMV.sh | 3 --- perf_test/batched/sparse/scripts/run_SPMV.sh.in | 1 + 7 files changed, 15 insertions(+), 12 deletions(-) delete mode 100755 perf_test/batched/sparse/scripts/run_CG.sh create mode 100755 perf_test/batched/sparse/scripts/run_CG.sh.in delete mode 100755 perf_test/batched/sparse/scripts/run_GMRES.sh create mode 100755 perf_test/batched/sparse/scripts/run_GMRES.sh.in delete mode 100755 perf_test/batched/sparse/scripts/run_SPMV.sh create mode 100755 perf_test/batched/sparse/scripts/run_SPMV.sh.in diff --git a/perf_test/batched/sparse/CMakeLists.txt b/perf_test/batched/sparse/CMakeLists.txt index 76a25d9938..b4f3c31f31 100644 --- a/perf_test/batched/sparse/CMakeLists.txt +++ b/perf_test/batched/sparse/CMakeLists.txt @@ -3,6 +3,15 @@ ADD_SUBDIRECTORY(cusolver) ADD_SUBDIRECTORY(GMRES) ADD_SUBDIRECTORY(SPMV) -FILE(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/scripts/binary_dir.txt -"${CMAKE_CURRENT_BINARY_DIR}" -) \ No newline at end of file +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_CG.sh.in + 
${CMAKE_CURRENT_BINARY_DIR}/scripts/run_CG.sh +) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_GMRES.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/scripts/run_GMRES.sh +) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_SPMV.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/scripts/run_SPMV.sh +) diff --git a/perf_test/batched/sparse/scripts/run_CG.sh b/perf_test/batched/sparse/scripts/run_CG.sh deleted file mode 100755 index fc740b0a77..0000000000 --- a/perf_test/batched/sparse/scripts/run_CG.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/CG/KokkosBatched_Test_CG -A ../data/A.mm -B ../data/B.mm -X ../output/X_CG -timers ../output/timers_CG -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_CG.sh.in b/perf_test/batched/sparse/scripts/run_CG.sh.in new file mode 100755 index 0000000000..d3b45fc5b6 --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_CG.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/CG/KokkosBatched_Test_CG -A ../data/A.mm -B ../data/B.mm -X ../output/X_CG -timers ../output/timers_CG -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_GMRES.sh b/perf_test/batched/sparse/scripts/run_GMRES.sh deleted file mode 100755 index e26ab2aa15..0000000000 --- a/perf_test/batched/sparse/scripts/run_GMRES.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/GMRES/KokkosBatched_Test_GMRES -A ../data/A.mm -B ../data/B.mm -X ../output/X_GMRES -timers ../output/timers_GMRES -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_GMRES.sh.in b/perf_test/batched/sparse/scripts/run_GMRES.sh.in new file mode 100755 
index 0000000000..b2e9e4174f --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_GMRES.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/GMRES/KokkosBatched_Test_GMRES -A ../data/A.mm -B ../data/B.mm -X ../output/X_GMRES -timers ../output/timers_GMRES -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_SPMV.sh b/perf_test/batched/sparse/scripts/run_SPMV.sh deleted file mode 100755 index d4edd993aa..0000000000 --- a/perf_test/batched/sparse/scripts/run_SPMV.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/SPMV/KokkosBatched_Test_SPMV -A ../data/A.mm -B ../data/B.mm -X ../output/X_SPMV -timers ../output/timers_SPMV -n1 10 -n2 100 -team_size -1 -implementation 3 -l -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_SPMV.sh.in b/perf_test/batched/sparse/scripts/run_SPMV.sh.in new file mode 100755 index 0000000000..2c9fabe547 --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_SPMV.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/SPMV/KokkosBatched_Test_SPMV -A ../data/A.mm -B ../data/B.mm -X ../output/X_SPMV -timers ../output/timers_SPMV -n1 10 -n2 100 -team_size -1 -implementation 3 -l -vector_length 8 -N_team 8 \ No newline at end of file From 825fff31d78f13fd0650c85dc33699320df30e3e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 18 Jul 2023 14:19:28 -0600 Subject: [PATCH 085/231] sparse/src: Add execution space inst member to GS handle --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 412985df72..3cc85b5bbc 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -84,6 +84,8 @@ class 
GaussSeidelHandle { nnz_lno_persistent_work_host_view_t; // Host view type protected: + HandleExecSpace execution_space; + GSAlgorithm algorithm_type; nnz_lno_persistent_work_host_view_t color_xadj; @@ -101,7 +103,8 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : algorithm_type(gs), + : execution_space(HandleExecSpace()), + algorithm_type(gs), color_xadj(), color_adj(), numColors(0), @@ -127,6 +130,10 @@ class GaussSeidelHandle { bool is_numeric_called() const { return this->called_numeric; } // setters + void set_execution_space(const HandleExecSpace exec_space) { + this->execution_space = exec_space; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; From 846aa30d2db8b23d285b6a087f5effd286a62fb9 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 18 Jul 2023 14:50:50 -0600 Subject: [PATCH 086/231] Add missing include --- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp index 0cddef964f..25da2d1278 100644 --- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -17,6 +17,8 @@ #ifndef TEST_COMMON_MERGE_MATRIX #define TEST_COMMON_MERGE_MATRIX +#include + #include #include From ec975834ef63a2299214b01026fe2f435775a80e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 18 Jul 2023 15:32:56 -0600 Subject: [PATCH 087/231] Update GS point symbolic and friends for streams --- common/src/KokkosKernels_SimpleUtils.hpp | 6 +- common/src/KokkosKernels_Utils.hpp | 54 +++++++------ .../impl/KokkosSparse_gauss_seidel_impl.hpp | 81 +++++++++++-------- sparse/src/KokkosSparse_Utils.hpp | 7 +- .../src/KokkosSparse_gauss_seidel_handle.hpp | 8 +- 5 files changed, 91 insertions(+), 65 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp 
b/common/src/KokkosKernels_SimpleUtils.hpp index a271695246..0c9e82773a 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -151,10 +151,10 @@ inline void kk_exclusive_parallel_prefix_sum( template void kk_inclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, - forward_array_type arr) { - typedef Kokkos::RangePolicy my_exec_space; + forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 2a4b749f92..c8cc284b73 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -457,9 +457,9 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, - forward_array_type arr) { + forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { kk_inclusive_parallel_prefix_sum( - num_elements, arr); + num_elements, arr, my_exec_space); } template @@ -668,14 +668,15 @@ void create_reverse_map( const forward_array_type &forward_map, // vertex to colors reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { // colros to vertex adj + reverse_array_type &reverse_map_adj, + MyExecSpace my_exec_space = MyExecSpace()) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; const lno_t MINIMUM_TO_ATOMIC = 64; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); reverse_map_adj = reverse_array_type( @@ -699,24 +700,27 @@ 
void create_reverse_map( forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); inclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); - Kokkos::parallel_for("KokkosKernels::Common::StridedCopy", - my_exec_space(0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); - MyExecSpace().fence(); + Kokkos::parallel_for( + "KokkosKernels::Common::StridedCopy", + range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy( + tmp_color_xadj, reverse_map_xadj, scale_size)); + my_exec_space.fence(); Fill_Reverse_Scale_Map frm( forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } else // atomic implementation. 
{ @@ -728,20 +732,22 @@ void create_reverse_map( forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( num_reverse_elements + 1, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } } @@ -1253,10 +1259,12 @@ template void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, - size_type &max_row_size) { - typedef Kokkos::RangePolicy my_exec_space; + size_type &max_row_size, + MyExecSpace my_exec_space = MyExecSpace()) { + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", my_exec_space(0, num_rows), + "KokkosKernels::Common::ViewReduceMaxRowSize", + range_policy_t(my_exec_space, 0, num_rows), ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index e4cfb4b047..4830011dfc 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -84,7 +84,7 @@ class PointGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy range_pol; + typedef Kokkos::RangePolicy range_policy_t; typedef typename 
HandleType::GraphColoringHandleType::color_view_t color_view_t; typedef typename HandleType::GraphColoringHandleType::color_t color_t; @@ -825,6 +825,7 @@ class PointGaussSeidel { void initialize_symbolic() { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); + const MyExecSpace my_exec_space = gsHandle->get_execution_space(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -838,6 +839,7 @@ class PointGaussSeidel { #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif + // TODO: Pass my_exec_space into KokkosGraph kernels typename HandleType::GraphColoringHandleType::color_view_t colors; color_t numColors; { @@ -886,7 +888,8 @@ class PointGaussSeidel { for (int i = 0; i < num_rows; ++i) { h_colors(i) = i + 1; } - Kokkos::deep_copy(colors, h_colors); + Kokkos::deep_copy(my_exec_space, colors, h_colors); + my_exec_space.fence(); #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; @@ -896,10 +899,10 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj); + num_rows, numColors, colors, color_xadj, color_adj, my_exec_space); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -909,7 +912,7 @@ class PointGaussSeidel { Kokkos::deep_copy(h_color_xadj, color_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "DEEP_COPY:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -930,21 +933,24 @@ class PointGaussSeidel { max_row_length_per_color); int sortLongRowsTeamSize = 1; { - team_policy_t temp(1, 1); + team_policy_t temp(my_exec_space, 1, 1); sortLongRowsTeamSize = 
temp.team_size_recommended( sortIntoLongRowsFunctor, Kokkos::ParallelReduceTag()); } - Kokkos::parallel_reduce(team_policy_t(numColors, sortLongRowsTeamSize), - sortIntoLongRowsFunctor, - Kokkos::Max(mostLongRowsInColor)); + Kokkos::parallel_reduce( + team_policy_t(my_exec_space, numColors, sortLongRowsTeamSize), + sortIntoLongRowsFunctor, Kokkos::Max(mostLongRowsInColor)); auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); - Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); + Kokkos::deep_copy(my_exec_space, host_long_rows_per_color, + long_rows_per_color); + my_exec_space.fence(); gsHandle->set_long_rows_per_color(host_long_rows_per_color); auto host_max_row_length_per_color = Kokkos::create_mirror_view(max_row_length_per_color); - Kokkos::deep_copy(host_max_row_length_per_color, + Kokkos::deep_copy(my_exec_space, host_max_row_length_per_color, max_row_length_per_color); + my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), @@ -953,10 +959,11 @@ class PointGaussSeidel { } else { // Just sort rows by ID. 
KokkosSparse::sort_crs_graph(color_xadj, color_adj); + decltype(color_adj)>(my_exec_space, + color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SORT_TIME:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -968,29 +975,29 @@ class PointGaussSeidel { Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), create_permuted_xadj(color_adj, xadj, permuted_xadj, old_to_new_map)); // std::cout << "create_permuted_xadj" << std::endl; #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl; timer.reset(); #endif KokkosKernels::Impl::inclusive_parallel_prefix_sum< - row_lno_persistent_work_view_t, MyExecSpace>(num_rows + 1, - permuted_xadj); + row_lno_persistent_work_view_t, MyExecSpace>( + num_rows + 1, permuted_xadj, my_exec_space); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl; timer.reset(); #endif Kokkos::parallel_for("KokkosSparse::PointGaussSeidel::fill_matrix_symbolic", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_symbolic(num_rows, color_adj, xadj, adj, // adj_vals, permuted_xadj, permuted_adj, @@ -998,7 +1005,7 @@ class PointGaussSeidel { old_to_new_map)); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -1013,7 +1020,7 @@ class PointGaussSeidel { size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size); + max_row_size, my_exec_space); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * 
block_size * block_size; @@ -1079,15 +1086,17 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows); + brows, permuted_xadj, num_values_in_l1, num_large_rows, + my_exec_space); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, - (size_type)(MyExecSpace().concurrency() / suggested_vector_size)); + (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. + // TODO: Need to account for number of streams here? size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< @@ -1396,7 +1405,7 @@ class PointGaussSeidel { block_matrix_size)); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_pol(0, num_rows), + range_policy_t(0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1427,7 +1436,7 @@ class PointGaussSeidel { } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_pol(0, num_rows), gmd); + range_policy_t(0, num_rows), gmd); } } else { @@ -1591,8 +1600,8 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); + // Kokkos::parallel_for( range_policy_t(0,nr), + // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, @@ -1673,8 +1682,8 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, 
color_adj)); + // Kokkos::parallel_for( range_policy_t(0,nr), + // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, @@ -1823,7 +1832,8 @@ class PointGaussSeidel { gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(color_index_end - numLongRows, + color_index_end), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1872,10 +1882,10 @@ class PointGaussSeidel { nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; if (numRegularRows) { - Kokkos::parallel_for( - labelShort, - range_pol(color_index_begin, color_index_end - numLongRows), - gs); + Kokkos::parallel_for(labelShort, + range_policy_t(color_index_begin, + color_index_end - numLongRows), + gs); } if (numLongRows) { gs._color_set_begin = color_index_end - numLongRows; @@ -1896,7 +1906,8 @@ class PointGaussSeidel { gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(color_index_end - numLongRows, + color_index_end), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 4039b6f5a7..88258356ef 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1887,11 +1887,12 @@ template void kk_reduce_numrows_larger_than_threshold( size_t num_elements, view_type view_to_reduce, typename view_type::const_value_type threshold, - typename view_type::non_const_value_type &sum_reduction) { - typedef Kokkos::RangePolicy my_exec_space; + typename view_type::non_const_value_type &sum_reduction, + MyExecSpace my_exec_space = MyExecSpace()) { + 
typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); } diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 3cc85b5bbc..6e57a23ee2 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -116,6 +116,8 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + HandleExecSpace get_execution_space() const { return this->execution_space; } + GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } nnz_lno_persistent_work_host_view_t get_color_xadj() const { @@ -131,7 +133,11 @@ class GaussSeidelHandle { // setters void set_execution_space(const HandleExecSpace exec_space) { - this->execution_space = exec_space; + static bool is_exec_space_set = false; + if (!is_exec_space_set) { + this->execution_space = exec_space; + is_exec_space_set = true; + } } void set_algorithm_type(const GSAlgorithm sgs_algo) { From 304a0e9de2b57443d90573d38e59a5fd94fc9bf6 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 10 Jul 2023 07:29:49 -0600 Subject: [PATCH 088/231] ODE: changing layout of temp mem in RK algorithms The change should give better data access on CPU --- ode/impl/KokkosODE_RungeKutta_impl.hpp | 8 ++++---- ode/unit_test/Test_ODE_RK.hpp | 21 +++++++++++++-------- ode/unit_test/Test_ODE_RK_chem.hpp | 4 ++-- perf_test/ode/KokkosODE_RK.cpp | 8 ++++---- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index 8bc9662c5a..f5fe39d65d 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -48,7 +48,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const 
table_type& table, // now accumulate y_new += dt*b_i*k_i { // we always start with y_new += dt*b_0*k0 - auto k0 = Kokkos::subview(k_vecs, Kokkos::ALL, 0); + auto k0 = Kokkos::subview(k_vecs, 0, Kokkos::ALL); ode.evaluate_function(t + table.c[0] * dt, dt, y_old, k0); for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { y_new(eqIdx) += dt * table.b[0] * k0(eqIdx); @@ -65,12 +65,12 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int idx = 0; idx < stageIdx; ++idx) { for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { temp(eqIdx) += - table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(eqIdx, idx); + table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); } } KokkosBlas::SerialScale::invoke(dt, temp); KokkosBlas::serial_axpy(1, y_old, temp); - auto k = Kokkos::subview(k_vecs, Kokkos::ALL, stageIdx); + auto k = Kokkos::subview(k_vecs, stageIdx, Kokkos::ALL); ode.evaluate_function(t + table.c[stageIdx] * dt, dt, temp, k); for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx); @@ -82,7 +82,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { temp(eqIdx) = 0; for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) { - temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(eqIdx, stageIdx); + temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); } } } diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 1d971cc8d3..59f0e3457a 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -130,7 +130,7 @@ void test_method(const std::string label, ode_type& my_ode, KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); Kokkos::RangePolicy my_policy(0, 1); RKSolve_wrapper @@ -152,11 +152,11 @@ void test_method(const 
std::string label, ode_type& my_ode, (void)label; #endif for (int stageIdx = 0; stageIdx < solver_type::num_stages(); ++stageIdx) { - EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(0, stageIdx), 1e-8); - EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(1, stageIdx), 1e-8); + EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(stageIdx, 0), 1e-8); + EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(stageIdx, 1), 1e-8); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - std::cout << " k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " - << kstack_h(1, stageIdx) << "}" << std::endl; + std::cout << " k" << stageIdx << "={" << kstack_h(stageIdx, 0) << ", " + << kstack_h(stageIdx, 1) << "}" << std::endl; #endif } EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); @@ -322,7 +322,7 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, using solver_type = KokkosODE::Experimental::RungeKutta; vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); vec_type y_new("solution", my_ode.neqs); vec_type y_old("intial conditions", my_ode.neqs); @@ -511,8 +511,8 @@ void test_adaptivity() { vec_type tmp("tmp vector", neqs); mv_type kstack( - "k stack", neqs, - KokkosODE::Experimental::RungeKutta::num_stages()); + "k stack", + KokkosODE::Experimental::RungeKutta::num_stages(), neqs); Kokkos::RangePolicy my_policy(0, 1); KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, @@ -564,6 +564,11 @@ void test_RK_conv_rate() { Test::test_convergence_rate(); } void test_RK_adaptivity() { Test::test_adaptivity(); } +int test_RK_adaptivity() { + Test::test_adaptivity(); + return 1; +} + #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, RKSolve_serial) { test_RK(); } TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); } diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 2adc202ddc..5abdd41d00 100644 --- 
a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -103,7 +103,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); - mv_type kstack("k stack", neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), neqs); // Set initial conditions vec_type y_new("solution", neqs); @@ -144,7 +144,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); - mv_type kstack("k stack", neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), neqs); // Set initial conditions vec_type y_new("solution", neqs); diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index e9dc3f2f8e..d45eec48c4 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -132,8 +132,8 @@ struct RKSolve_wrapper { auto local_y_new = Kokkos::subview(y_new, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_tmp = Kokkos::subview(tmp, Kokkos::pair(2 * idx, 2 * idx + 1)); - auto local_kstack = Kokkos::subview( - kstack, Kokkos::pair(2 * idx, 2 * idx + 1), Kokkos::ALL()); + auto local_kstack = Kokkos::subview(kstack, Kokkos::ALL(), + Kokkos::pair(2 * idx, 2 * idx + 1)); // Run Runge-Kutta time integrator KokkosODE::Impl::RKSolve( @@ -178,7 +178,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { table_type table; ode_params params(num_steps); vec_type tmp("tmp vector", neqs * num_odes); - mv_type kstack("k stack", neqs * num_odes, table.nstages); + mv_type kstack("k stack", table.nstages, neqs * num_odes); // Set initial conditions vec_type y_new("solution", neqs * num_odes); @@ -230,7 +230,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { table_type table; ode_params params(num_steps); vec_type tmp("tmp vector", neqs * num_odes); - mv_type kstack("k stack", neqs * num_odes, table.nstages); + mv_type 
kstack("k stack", table.nstages, neqs * num_odes); // Set initial conditions vec_type y_new("solution", neqs * num_odes); From 64311965cc8653bf445cf6f76b4e893213003d30 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 19 Jul 2023 12:55:25 -0600 Subject: [PATCH 089/231] ODE: fix unnecessary test overload This appeared after a rebase, unfortunate... --- ode/unit_test/Test_ODE_RK.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 59f0e3457a..2e6df4fd81 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -564,11 +564,6 @@ void test_RK_conv_rate() { Test::test_convergence_rate(); } void test_RK_adaptivity() { Test::test_adaptivity(); } -int test_RK_adaptivity() { - Test::test_adaptivity(); - return 1; -} - #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, RKSolve_serial) { test_RK(); } TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); } From 52324c3766f043557c9b54bd2637634e77096390 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 19 Jul 2023 13:46:07 -0600 Subject: [PATCH 090/231] add missing headers, std::vector -> std::vector<...> --- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 22 +++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp index 25da2d1278..9fc533ed9c 100644 --- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -14,15 +14,17 @@ // //@HEADER -#ifndef TEST_COMMON_MERGE_MATRIX -#define TEST_COMMON_MERGE_MATRIX +#ifndef TEST_COMMON_MERGE_MATRIX_HPP +#define TEST_COMMON_MERGE_MATRIX_HPP +#include +#include +#include #include #include #include -#include #include "KokkosKernels_Iota.hpp" #include "KokkosSparse_MergeMatrix.hpp" @@ -293,7 +295,8 @@ void view_view_full_full() { for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { MMD 
mmd(a, b, diagonal); // every matrix entry on this diagonal is 0 - expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(0))); + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(0))); } } { @@ -301,7 +304,8 @@ void view_view_full_full() { for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { MMD mmd(a, b, diagonal); // every matrix entry on this diagonal is 0 - expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(1))); + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(1))); } } { @@ -494,7 +498,8 @@ void view_iota_full_full() { for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { MMD mmd(a, b, diagonal); // every matrix entry on this diagonal is 0 - expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(0))); + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(0))); } } { @@ -502,7 +507,8 @@ void view_iota_full_full() { for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { MMD mmd(a, b, diagonal); // every matrix entry on this diagonal is 1 - expect_mmd_entries(mmd, std::vector(mmd.size(), mmd_value_type(1))); + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(1))); } } { @@ -591,4 +597,4 @@ TEST_F(TestCategory, common_merge_matrix) { // clang-format on } -#endif // TEST_COMMON_MERGE_MATRIX +#endif // TEST_COMMON_MERGE_MATRIX_HPP From 86d8371fdb3d528095a86635b9add53a36bb516e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 27 Jun 2023 14:04:16 -0700 Subject: [PATCH 091/231] Improve performance of the native BsrMatrix SpMV, especially for single-vector cases. * Adds a new `"v4.2"` BsrMatrix SpMV implementation for non-transpose mode. 
* It is the default (when TPLs are disabled or not supported) on the GPU for non-transpose mode * The old implementation is retained for all other modes * old implementation may be requested explicitly with `controls.setParameter("algorithm", "v4.1")` * Adds explicit invocation of old "4.1" impl to `KokkosKernels_sparse_spmv_bsr_benchmark` * When TPLs are enabled, the new implementation may be requested anyway with `controls.setParameter("algorithm", "v4.2")` * simplify `KokkosKernels::Impl::always_false_v` * Add `template class with_unmanaged` which provides a `type` alias reproducing `View` with `Kokkos::Unmanaged` added to its memory traits * Add `KokkosKernels::Impl::with_unmanaged_t` as an alias for `typename with_unmanaged::type` * Add `template auto KokkosKernels::Impl::make_unmanaged(const View &v)` which constructs a `with_unmanaged_t` from v * Add `` to `KokkosKernels_Error.hpp` * Add `DieOnError` and `SkipOnError` wrapped `bool`s to give names to boolean function arguments * Link `KokkosKernels_sparse_spmv_bsr_benchmark` against `stdc++fs` for rocm 5.2 * More aggressive block size filtering in `KokkosSparse_crs_detect_block_size.hpp` * Removes a useless warning from `Controls::getParameter` since what happens when a parameter is unset was made explicit in https://github.com/kokkos/kokkos-kernels/commit/be87154a2f83f25c269eb3ce2bcca0b82356a8c5 * `BsrMatrix` constructor throws when combination of nnz, rows, and columns doesn't make sense * Change `BsrMatrix::block_layout` to `BsrMatrix::block_layout_type` for consistency * Adds `BsrMatrix::unmanaged_block` to return an unmanaged view to a 2D block of values * Adds `BsrMatrix::unmanaged_block_const` to return a const unmanaged view to a 2D block of values --- .../KokkosKernels_AlwaysFalse.hpp | 21 +-- common/impl/KokkosKernels_ViewUtils.hpp | 59 +++++++ common/src/KokkosKernels_Error.hpp | 1 + perf_test/Benchmark_Utils.hpp | 45 ++++++ perf_test/sparse/CMakeLists.txt | 6 + .../KokkosSparse_spmv_bsr_benchmark.cpp
| 90 ++++++----- .../KokkosSparse_crs_detect_block_size.hpp | 26 ++-- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 16 +- .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 144 ++++++++++++++++++ .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 103 ++++++++++--- sparse/src/KokkosKernels_Controls.hpp | 2 - sparse/src/KokkosSparse_BsrMatrix.hpp | 51 +++++-- sparse/src/KokkosSparse_spmv.hpp | 12 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 2 +- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 7 +- 15 files changed, 477 insertions(+), 108 deletions(-) rename common/{src => impl}/KokkosKernels_AlwaysFalse.hpp (63%) create mode 100644 common/impl/KokkosKernels_ViewUtils.hpp create mode 100644 perf_test/Benchmark_Utils.hpp create mode 100644 sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp diff --git a/common/src/KokkosKernels_AlwaysFalse.hpp b/common/impl/KokkosKernels_AlwaysFalse.hpp similarity index 63% rename from common/src/KokkosKernels_AlwaysFalse.hpp rename to common/impl/KokkosKernels_AlwaysFalse.hpp index 36f4572d29..12acf4a524 100644 --- a/common/src/KokkosKernels_AlwaysFalse.hpp +++ b/common/impl/KokkosKernels_AlwaysFalse.hpp @@ -17,23 +17,12 @@ #ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP #define KOKKOSKERNELS_ALWAYSFALSE_HPP -#include +namespace KokkosKernels::Impl { -/*! 
\file KokkosKernels_AlwaysFalse.hpp - \brief A convenience type to be used in a static_assert that should always - fail -*/ +// for use in static asserts +template +inline constexpr bool always_false_v = false; -namespace KokkosKernels { -namespace Impl { - -template -using always_false = std::false_type; - -template -inline constexpr bool always_false_v = always_false::value; - -} // namespace Impl -} // namespace KokkosKernels +} // namespace KokkosKernels::Impl #endif // KOKKOSKERNELS_ALWAYSFALSE_HPP diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp new file mode 100644 index 0000000000..2ae8fb609d --- /dev/null +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -0,0 +1,59 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_VIEWUTILS_HPP +#define KOKKOSKERNELS_VIEWUTILS_HPP +#include "Kokkos_Core.hpp" + +namespace KokkosKernels::Impl { + +/*! \brief Yields a type that is View with Kokkos::Unmanaged added to the memory + * traits + */ +template +class with_unmanaged { + using data_type = typename View::data_type; + using layout_type = typename View::array_layout; + using memory_space = typename View::memory_space; + + using orig_traits = typename View::memory_traits; + static constexpr unsigned new_traits = + orig_traits::impl_value | Kokkos::Unmanaged; + + public: + using type = Kokkos::View >; +}; + +/*! 
\brief A type that is View with Kokkos::Unmanaged added to the memory traits + + \tparam View the type to add Kokkos::Unmanaged to + */ +template +using with_unmanaged_t = typename with_unmanaged::type; + +/*! \brief Returns an unmanaged version of v + + \tparam View the type of the input view v + */ +template +auto make_unmanaged(const View &v) { + return typename with_unmanaged::type(v); +} + +} // namespace KokkosKernels::Impl + +#endif diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 4d732a8437..52aa6d88da 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -18,6 +18,7 @@ #define KOKKOSKERNELS_ERROR_HPP #include +#include namespace KokkosKernels { namespace Impl { diff --git a/perf_test/Benchmark_Utils.hpp b/perf_test/Benchmark_Utils.hpp new file mode 100644 index 0000000000..8f34182f41 --- /dev/null +++ b/perf_test/Benchmark_Utils.hpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP +#define KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP + +namespace KokkosKernelsBenchmark { + +class WrappedBool { + public: + WrappedBool(const bool &val) : val_(val) {} + + operator bool() const { return val_; } + + protected: + bool val_; +}; + +class DieOnError : public WrappedBool { + public: + DieOnError(const bool &val) : WrappedBool(val) {} +}; +class SkipOnError : public WrappedBool { + public: + SkipOnError(const bool &val) : WrappedBool(val) {} +}; + +} // namespace KokkosKernelsBenchmark + +#endif // KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP \ No newline at end of file diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 2039276c79..8a994b4122 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -139,4 +139,10 @@ if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( sparse_spmv_bsr_benchmark SOURCES KokkosSparse_spmv_bsr_benchmark.cpp ) + + # hipcc 5.2 has an underlying clang that has the std::filesystem + # in an experimental namespace and a different library + if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3) + target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) + endif() endif() diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp index 933917c1a6..770b09cfb1 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -45,6 +45,7 @@ namespace fs = std::filesystem; #include +#include "Benchmark_Utils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_spmv.hpp" @@ -121,27 +122,6 @@ size_t detect_block_size(const fs::path &path) { return cache.at(path); } -// a bool by a different name, to make 
its purpose clear -class DieOnError { - public: - DieOnError(const bool &val) : val_(val) {} - - operator bool() const { return val_; } - - private: - bool val_; -}; - -// a bool by a different name, to make its purpose clear -class SkipOnError { - public: - SkipOnError(const bool &val) : val_(val) {} - operator bool() const { return val_; } - - private: - bool val_; -}; - // Test that y_act is close to y_exp. // This needs the matrix, alpha, and beta to compute the error tolerance // properly @@ -235,6 +215,20 @@ struct SpmvNative { static std::string name() { return "native"; } }; +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvV41 { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "v4.1"); + return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "v4.1"; } +}; + template void run(benchmark::State &state, const Bsr &bsr, const size_t k) { using execution_space = typename Bsr::execution_space; @@ -269,10 +263,10 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) { const char *mode = KokkosSparse::NoTranspose; // test the SpMV against whatever the default is - KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); - Kokkos::fence(); Spmv::spmv(mode, alpha, bsr, x, beta, y_act); Kokkos::fence(); + KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); check_correctness(state, y_exp, y_act, bsr, alpha, beta, DieOnError(false), SkipOnError(true)); @@ -299,7 +293,6 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) { template void read_expand_run(benchmark::State &state, const fs::path &path, const size_t blockSize, const size_t k) { - using device_type = typename Bsr::device_type; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = 
typename Bsr::non_const_ordinal_type; @@ -322,7 +315,6 @@ void read_expand_run(benchmark::State &state, const fs::path &path, template void read_convert_run(benchmark::State &state, const fs::path &path, const size_t blockSize, const size_t k) { - using device_type = typename Bsr::device_type; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; @@ -386,27 +378,53 @@ template void register_converts(const fs::path &path, const size_t bs) { std::cerr << "benchmarks will use detected blocksize\n"; // clang-format off - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + // clang-format on } template void register_expands(const fs::path &path) { - register_expand_type(path); - register_expand_type(path); + std::cerr << "benchmarks will expand each non-zero into a larger block\n"; + // clang-format off register_expand_type(path); register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); register_expand_type(path); + register_expand_type(path); + 
register_expand_type(path); register_expand_type(path); + register_expand_type(path); + // clang-format on } template @@ -425,10 +443,8 @@ void register_path(const fs::path &path) { Otherwise, expand the matrix to some arbitrary block sizes to test BSR */ if (detectedSize != 1) { - std::cerr << "benchmarks will use detected size\n"; register_converts(path, detectedSize); } else { - std::cerr << "benchmarks will expand each non-zero into a larger block\n"; register_expands(path); } } diff --git a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp index 42d4eddf89..418f2a74cc 100644 --- a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp +++ b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp @@ -28,8 +28,7 @@ for performance-sensitive use. */ -namespace KokkosSparse { -namespace Impl { +namespace KokkosSparse::Impl { /** * \class BlockPopulations @@ -86,14 +85,14 @@ class BlockPopulations { * @return The largest block size that results in completely dense blocks The smallest valid block size is 1 Since blocks must be dense, sqrt(nnz), num rows, num cols, and min nnz/row - among non-empty rows are all easy upper bounds of the block size Block sizes - are tested from 1 to the minimum of the above The matrix dimensions must divide - evenly into a trial block size (otherwise a block would not be full) - Furthermore, if a block size of N is not dense, any multiple of N will also not - be dense, and can be skipped. This is because blocks of 2N contain blocks of N, - at least one of which is already known not to be dense. In practice, this ends - up testing only small composite factors and all prime factors up to the upper - bound + among non-empty rows are all easy upper bounds of the block size. + Block sizes are tested from 1 to the minimum of the above. + The matrix dimensions must divide evenly into a trial block size (otherwise a + block would not be full). 
Furthermore, if a block size of N is not dense, any + multiple of N will also not be dense, and can be skipped. This is because + blocks of 2N contain blocks of N, at least one of which is already known not to + be dense. In practice, this ends up testing only small composite factors and + all prime factors up to the upper bound. */ template size_t detect_block_size(const Crs &crs) { @@ -124,12 +123,14 @@ size_t detect_block_size(const Crs &crs) { for (size_t trialSize = 2; trialSize <= upperBound; ++trialSize) { // trial size must be factor of rows / cols if ((crs.numRows() % trialSize) || (crs.numCols() % trialSize)) { + rejectedSizes.push_back(trialSize); continue; } // trial size must not be a multiple of previously-rejected size if (std::any_of(rejectedSizes.begin(), rejectedSizes.end(), [&](size_t f) { return trialSize % f == 0; })) { + rejectedSizes.push_back(trialSize); continue; } @@ -152,7 +153,6 @@ size_t detect_block_size(const Crs &crs) { return largestBlockSize; } -} // namespace Impl -} // namespace KokkosSparse +} // namespace KokkosSparse::Impl -#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP \ No newline at end of file +#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 8e4c187b99..7f1ff2171e 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -23,7 +23,21 @@ Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) { using crs_row_map_type = typename Crs::row_map_type; using bsr_row_map_type = Kokkos::View; + bsr_device_type>; // need non-const version + + using bsr_size_type = typename Bsr::non_const_size_type; + + { + size_t nnz = crs.nnz() * blockSize * blockSize; + if (nnz > size_t(Kokkos::ArithTraits::max())) { + std::stringstream ss; + ss << "expanding " << crs.nnz() + << " non-zeros of CrsMatrix into blocks of " << blockSize + << " would overflow size_type of requested 
BsrMatrix " + << Kokkos::ArithTraits::name(); + throw std::runtime_error(ss.str()); + } + } // construct the Bsr row map bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size()); diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp new file mode 100644 index 0000000000..9c5858a307 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -0,0 +1,144 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP +#define KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP + +#include + +#include + +namespace KokkosSparse { +namespace Impl { + +/* One thread for each entry in the product multivector + + Each thread accumulates the partial products for its entry, and writes it + out. 
+*/ +template +class BsrSpmvV42NonTrans { + Alpha alpha_; + AMatrix a_; + XVector x_; + Beta beta_; + YVector y_; + + public: + BsrSpmvV42NonTrans(const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) + : alpha_(alpha), a_(a), x_(x), beta_(beta), y_(y) {} + + template + KOKKOS_INLINE_FUNCTION void impl(const size_t k) const { + using a_ordinal_type = typename AMatrix::non_const_ordinal_type; + using a_size_type = typename AMatrix::non_const_size_type; + using y_value_type = typename YVector::non_const_value_type; + using const_block_type = typename AMatrix::const_block_type; + + const a_ordinal_type irhs = k / y_.extent(0); + const a_ordinal_type row = k % y_.extent(0); + + // scale by beta + if (0 == beta_) { + y_(row, irhs) = 0; // convert NaN to 0 + } else if (1 != beta_) { + y_(row, irhs) *= beta_; + } + + // for non-zero template instantiations, + // constant propagation should optimize divmod + a_ordinal_type blocksz; + if constexpr (0 == BLOCK_SIZE) { + blocksz = a_.blockDim(); + } else { + blocksz = BLOCK_SIZE; + } + + if (0 != alpha_) { + const a_ordinal_type blockRow = row / blocksz; + const a_ordinal_type lclrow = row % blocksz; + y_value_type accum = 0; + const a_size_type j_begin = a_.graph.row_map(blockRow); + const a_size_type j_end = a_.graph.row_map(blockRow + 1); + for (a_size_type j = j_begin; j < j_end; ++j) { + const_block_type b = a_.unmanaged_block_const(j); + const a_ordinal_type blockcol = a_.graph.entries(j); + const a_ordinal_type x_start = blockcol * blocksz; + + const auto x_lcl = Kokkos::subview( + x_, Kokkos::make_pair(x_start, x_start + blocksz), irhs); + for (a_ordinal_type i = 0; i < blocksz; ++i) { + accum += b(lclrow, i) * x_lcl(i); + } + } + y_(row, irhs) += alpha_ * accum; + } + } + + KOKKOS_INLINE_FUNCTION void operator()(const size_t k) const { + if (false) { + } + // clang-format off + else if ( 1 == a_.blockDim()) { impl< 1>(k); } + else if ( 2 == a_.blockDim()) { impl< 2>(k); } + 
else if ( 3 == a_.blockDim()) { impl< 3>(k); } + else if ( 4 == a_.blockDim()) { impl< 4>(k); } + else if ( 5 == a_.blockDim()) { impl< 5>(k); } + else if ( 6 == a_.blockDim()) { impl< 6>(k); } + else if ( 7 == a_.blockDim()) { impl< 7>(k); } + else if ( 8 == a_.blockDim()) { impl< 8>(k); } + else if ( 9 == a_.blockDim()) { impl< 9>(k); } + else if (10 == a_.blockDim()) { impl<10>(k); } + else if (11 == a_.blockDim()) { impl<11>(k); } + // clang-format on + else { + impl<0>(k); + } + } +}; + +template +void apply_v42(const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) { + using execution_space = typename YVector::execution_space; + + Kokkos::RangePolicy policy(0, y.size()); + if constexpr (YVector::rank == 1) { + // Implementation expects a 2D view, so create an unmanaged 2D view + // with extent 1 in the second dimension + using Y2D = KokkosKernels::Impl::with_unmanaged_t>; + using X2D = KokkosKernels::Impl::with_unmanaged_t>; + const Y2D yu(y.data(), y.extent(0), 1); + const X2D xu(x.data(), x.extent(0), 1); + BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); + Kokkos::parallel_for(policy, op); + } else { + BsrSpmvV42NonTrans op(alpha, a, x, beta, y); + Kokkos::parallel_for(policy, op); + } +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 678aaaa0c5..69ff744e9d 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -25,6 +25,7 @@ #include "KokkosKernels_Error.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include +#include "KokkosSparse_spmv_bsrmatrix_impl_v42.hpp" #endif namespace KokkosSparse { @@ -136,6 +137,11 @@ struct SPMV_MV_BSRMATRIX { // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +// these should all be different +constexpr inline const char *ALG_V41 = "v4.1"; +constexpr inline const char *ALG_V42 = "v4.2"; +constexpr inline const char *ALG_TC = "experimental_bsr_tc"; + template struct SPMV_BSRMATRIX() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + return; + } + } + + // fall back to V41 all else fails + if (modeIsNoTrans || modeIsConjugate) { return Bsr::spMatVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { return Bsr::spMatVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". 
Please report this"; + throw std::runtime_error(ss.str()); } } }; @@ -194,7 +231,7 @@ struct SPMV_MV_BSRMATRIX::is_complex) method = Method::Fallback; @@ -289,17 +326,49 @@ struct SPMV_MV_BSRMATRIX() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + return; + } + } - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - bool useConjugate = (mode[0] == Conjugate[0]); + // use V41 as the ultimate fallback + if (modeIsNoTrans || modeIsConjugate) { return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". Please report this"; + throw std::runtime_error(ss.str()); } } }; diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index c600dad89a..0bb8f79ff0 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -64,8 +64,6 @@ class Controls { const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { - std::cerr << "WARNING: Controls::getParameter for name \"" << name - << "\" was unset" << std::endl; return orUnset; } else { return search->second; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index a366245a86..b36143c14b 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -393,10 +393,18 @@ class BsrMatrix { //! 
Nonconst version of the type of the entries in the sparse matrix. typedef typename values_type::non_const_value_type non_const_value_type; - // block values are actually a 1-D view, however they are implicitly - // arranged in LayoutRight, e.g. consecutive entries in the values view - // are consecutive entries within a row inside a block - using block_layout = Kokkos::LayoutRight; + //! block values are actually a 1-D view, however they are implicitly + //! arranged in LayoutRight, e.g. consecutive entries in the values view + //! are consecutive entries within a row inside a block + using block_layout_type = Kokkos::LayoutRight; + + //! Type returned by \c unmanaged_block + using block_type = Kokkos::View; + + //! Type returned by \c unmanaged_block_const + using const_block_type = Kokkos::View; /// \name Storage of the actual sparsity structure and values. /// @@ -480,15 +488,12 @@ class BsrMatrix { /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. /// \param blockdim [in] The block size of the constructed BsrMatrix. - /// \param pad [in] If true, pad the sparse matrix's storage with - /// zeros in order to improve cache alignment and / or - /// vectorization. + /// \param pad [in] Ignored /// /// The \c pad argument is currently not used. 
BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, size_type annz, ScalarType* vals, OrdinalType* rows, OrdinalType* cols, OrdinalType blockdim, bool pad = false) { - (void)label; (void)pad; blockDim_ = blockdim; @@ -517,6 +522,16 @@ class BsrMatrix { "BsrMatrix:: annz should be a multiple of the number of entries in a " "block"); } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } using Coord = std::pair; // row, col using CoordComp = std::function, + std::is_same_v, "A blocks must be stored layout-right"); rocsparse_direction dir = rocsparse_direction_row; diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index b2883c1e91..695f03e67f 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -373,7 +373,8 @@ void test_spmv_combos(const char *mode, const Bsr &a) { auto [x, y] = random_vecs_for_spmv(mode, a); - for (auto alg : {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (auto alg : + {(const char *)(nullptr), "native", "experimental_tc", "v4.1", "v4.2"}) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), @@ -569,8 +570,8 @@ void test_spm_mv_combos(const char *mode, const Bsr &a) { for (size_t numVecs : {1, 2, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (auto alg : - {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (auto alg : {(const char *)(nullptr), "native", "experimental_tc", + "v4.1", "v4.2"}) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), 
scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), From e9a53dc04e7a02194ab7a6a8daa959b1193f10f2 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 25 Jul 2023 11:36:04 -0600 Subject: [PATCH 092/231] BLAS: fix assignable check in gemv and gemm This is actually not a required prerequisite; we only need to check that the views are accessible. --- blas/src/KokkosBlas2_gemv.hpp | 8 -------- blas/src/KokkosBlas3_gemm.hpp | 8 -------- 2 files changed, 16 deletions(-) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index a8ebf02ca3..988426aea9 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -82,14 +82,6 @@ void gemv(const execution_space& space, const char trans[], Kokkos::SpaceAccessibility::accessible, "KokkosBlas::gemv: YViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: AViewType must be assignable to YViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: XViewType must be assignable to YViewType"); // Check compatibility of dimensions at run time. 
if (trans[0] == 'N' || trans[0] == 'n') { diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 0cb00c8493..febd39b149 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -142,14 +142,6 @@ void gemm(const execution_space& space, const char transA[], Kokkos::SpaceAccessibility::accessible, "KokkosBlas::gemm: CViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType must be assignable by AViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType must be assignable by BViewType"); // Check validity of transpose argument bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || From 1ca81655a4a30d6b5d05b3efff155fdef8f49ba0 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Tue, 25 Jul 2023 16:37:43 -0600 Subject: [PATCH 093/231] mdf: fix initial value in select pivot functor --- sparse/impl/KokkosSparse_mdf_impl.hpp | 15 ++++---- sparse/src/KokkosSparse_mdf.hpp | 55 ++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 73f77685af..3adb42454b 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -356,9 +356,7 @@ struct MDF_select_row { } KOKKOS_INLINE_FUNCTION - void init(value_type& dst) const { - dst = Kokkos::ArithTraits::zero(); - } + void init(value_type& dst) const { dst = factorization_step; } }; // MDF_select_row @@ -567,15 +565,13 @@ struct MDF_compute_list_length { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t team, ordinal_type& update_list_len, ordinal_type& selected_row_len) const { - const ordinal_type selected_row = permutation(selected_row_idx); - - const auto rowView = A.rowConst(selected_row); - const auto colView = At.rowConst(selected_row); + ordinal_type selected_row = 0; size_type 
U_entryIdx = row_mapU(factorization_step); size_type L_entryIdx = row_mapL(factorization_step); Kokkos::single(Kokkos::PerTeam(team), [&] { + selected_row = permutation(selected_row_idx); discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors @@ -595,6 +591,11 @@ struct MDF_compute_list_length { }); ++L_entryIdx; + // Only one thread has the selected row + team.team_reduce(Kokkos::Max(selected_row)); + const auto rowView = A.rowConst(selected_row); + const auto colView = At.rowConst(selected_row); + // Insert the upper part of the selected row in U // including the diagonal term. ordinal_type updateIdx = 0; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index c55b2cfa1c..ee8139d6ac 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -61,10 +61,28 @@ void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { return; } // mdf_symbolic +template +void mdf_print_joined_view( + const view_t& dev_view, const char* sep, + ordinal_t max_count = Kokkos::ArithTraits::max()) { + const auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dev_view); + + max_count = max_count > (ordinal_t)host_view.extent(0) + ? 
(ordinal_t)host_view.extent(0) + : max_count; + for (ordinal_t i = 0; i < max_count; ++i) { + if (i) printf("%s", sep); + printf("%g", static_cast(host_view[i])); + } +} + template void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; + using scalar_mag_type = + typename KokkosSparse::Impl::MDF_types::scalar_mag_type; using values_mag_type = typename KokkosSparse::Impl::MDF_types::values_mag_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -107,11 +125,11 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { if (verbosity_level > 0) { - printf("\n\nFactorization step %d\n\n", + printf("\n\nFactorization step %d\n", static_cast(factorization_step)); } - { + if (update_list_len > 0) { team_range_policy_type updatePolicy(update_list_len, Kokkos::AUTO, Kokkos::AUTO); KokkosSparse::Impl::MDF_discarded_fill_norm @@ -122,6 +140,17 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { MDF_update_df_norm); } + if (verbosity_level > 1) { + if constexpr (std::is_arithmetic_v) { + printf(" discarded_fill = {"); + mdf_print_joined_view(discarded_fill, ", "); + printf("}\n"); + } + printf(" deficiency = {"); + mdf_print_joined_view(deficiency, ", "); + printf("}\n"); + } + ordinal_type selected_row_idx = 0; { range_policy_type stepPolicy(factorization_step, Atmp.numRows()); @@ -147,6 +176,24 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { updateList, update_list_len, selected_row_len); } + if (verbosity_level > 1) { + printf(" updateList = {"); + mdf_print_joined_view(update_list, ", ", update_list_len); + printf("}\n permutation = {"); + mdf_print_joined_view(handle.permutation, ", "); + printf("}\n permutation_inv = {"); + mdf_print_joined_view(handle.permutation_inv, ", "); + printf("}\n"); + } + if 
(verbosity_level > 0) { + printf( + " Selected row idx %d with length %d. Requires update of %d fill " + "norms.\n", + static_cast(selected_row_idx), + static_cast(selected_row_len), + static_cast(update_list_len)); + } + // If this was the last row no need to update A and At! if (factorization_step < A.numRows() - 1) { team_range_policy_type factorizePolicy(selected_row_len, Kokkos::AUTO, @@ -159,10 +206,6 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { Kokkos::parallel_for("MDF: factorize row", factorizePolicy, factorize_row); } - - if (verbosity_level > 0) { - printf("\n"); - } } // Loop over factorization steps KokkosSparse::Impl::MDF_reindex_matrix reindex_U( From 98d5a24e2c53cd936ed84ac268ed6b861017baf8 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 12:11:08 -0600 Subject: [PATCH 094/231] sparse/impl: Make PSGS non-blocking - This change also slightly improves performance perf_test/sparse: Add launch and compute timers --- perf_test/sparse/KokkosSparse_gs.cpp | 26 +++++-- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 68 ++++++++++++------- 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c11c6bdc02..119941cebc 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -219,14 +219,22 @@ void runGS(const GS_Parameters& params) { KokkosSparse::Experimental::gauss_seidel_symbolic( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); - double symbolicTime = timer.seconds(); - std::cout << "\n*** Symbolic time: " << symbolicTime << '\n'; + double symbolicLaunchTime = timer.seconds(); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double symbolicComputeTime = timer.seconds(); + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); 
KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); - double numericTime = timer.seconds(); - std::cout << "\n*** Numeric time: " << numericTime << '\n'; + double numericLaunchTime = timer.seconds(); + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double numericComputeTime = timer.seconds(); + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -246,8 +254,14 @@ void runGS(const GS_Parameters& params) { true, true, 1.0, params.sweeps); break; } - double applyTime = timer.seconds(); - std::cout << "\n*** Apply time: " << applyTime << '\n'; + + double applyLaunchTime = timer.seconds(); + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double applyComputeTime = timer.seconds(); + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; + timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual scalar_view_t res("Ax-b", nrows); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 4830011dfc..abaa01effe 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1787,25 +1787,31 @@ class PointGaussSeidel { if (block_size == 1) { Kokkos::parallel_for( labelRegular, - team_policy_t((numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else if (gs.num_max_vals_in_l2 == 0) { Kokkos::parallel_for( labelBlock, - 
block_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + block_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else { Kokkos::parallel_for( labelBigBlock, - bigblock_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + bigblock_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -1827,13 +1833,17 @@ class PointGaussSeidel { Kokkos::deep_copy(long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, - longrow_apply_team_policy_t(numLongRows * teams_per_row, - longRowTeamSize), + Kokkos::Experimental::require( + longrow_apply_team_policy_t(numLongRows * teams_per_row, + longRowTeamSize), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_policy_t(color_index_end - numLongRows, - color_index_end), + Kokkos::Experimental::require( + range_policy_t(color_index_end - numLongRows, + color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1882,10 +1892,13 @@ class PointGaussSeidel { nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; if (numRegularRows) { - Kokkos::parallel_for(labelShort, - range_policy_t(color_index_begin, - color_index_end - numLongRows), - gs); + Kokkos::parallel_for( + labelShort, + Kokkos::Experimental::require( + range_policy_t(color_index_begin, + color_index_end - 
numLongRows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); } if (numLongRows) { gs._color_set_begin = color_index_end - numLongRows; @@ -1900,14 +1913,19 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(long_row_x, nnz_scalar_t()); - Kokkos::parallel_for(labelLong, - Kokkos::RangePolicy( - 0, numLongRows * par_per_row), - gs); + Kokkos::parallel_for( + labelLong, + Kokkos::Experimental::require( + Kokkos::RangePolicy( + 0, numLongRows * par_per_row), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_policy_t(color_index_end - numLongRows, - color_index_end), + Kokkos::Experimental::require( + range_policy_t(color_index_end - numLongRows, + color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); From 890f5ffa8fb6e4c6475985b08d599dd281397798 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 12:11:08 -0600 Subject: [PATCH 095/231] sparse/impl: Make PSGS non-blocking - This change also slightly improves performance perf_test/sparse: Add launch and compute timers Conflicts: sparse/impl/KokkosSparse_gauss_seidel_impl.hpp --- perf_test/sparse/KokkosSparse_gs.cpp | 26 +++++++-- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 57 ++++++++++++------- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c11c6bdc02..119941cebc 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -219,14 +219,22 @@ void runGS(const GS_Parameters& params) { KokkosSparse::Experimental::gauss_seidel_symbolic( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); - double symbolicTime = 
timer.seconds(); - std::cout << "\n*** Symbolic time: " << symbolicTime << '\n'; + double symbolicLaunchTime = timer.seconds(); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double symbolicComputeTime = timer.seconds(); + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); - double numericTime = timer.seconds(); - std::cout << "\n*** Numeric time: " << numericTime << '\n'; + double numericLaunchTime = timer.seconds(); + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double numericComputeTime = timer.seconds(); + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -246,8 +254,14 @@ void runGS(const GS_Parameters& params) { true, true, 1.0, params.sweeps); break; } - double applyTime = timer.seconds(); - std::cout << "\n*** Apply time: " << applyTime << '\n'; + + double applyLaunchTime = timer.seconds(); + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double applyComputeTime = timer.seconds(); + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; + timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual scalar_view_t res("Ax-b", nrows); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index e4cfb4b047..0f03eb04b3 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1778,25 +1778,31 @@ class PointGaussSeidel { if (block_size == 1) { Kokkos::parallel_for( labelRegular, - team_policy_t((numRegularRows + 
team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else if (gs.num_max_vals_in_l2 == 0) { Kokkos::parallel_for( labelBlock, - block_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + block_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else { Kokkos::parallel_for( labelBigBlock, - bigblock_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + bigblock_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -1818,12 +1824,16 @@ class PointGaussSeidel { Kokkos::deep_copy(long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, - longrow_apply_team_policy_t(numLongRows * teams_per_row, - longRowTeamSize), + Kokkos::Experimental::require( + longrow_apply_team_policy_t(numLongRows * teams_per_row, + longRowTeamSize), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1874,7 +1884,9 @@ class 
PointGaussSeidel { if (numRegularRows) { Kokkos::parallel_for( labelShort, - range_pol(color_index_begin, color_index_end - numLongRows), + Kokkos::Experimental::require( + range_pol(color_index_begin, color_index_end - numLongRows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } if (numLongRows) { @@ -1890,13 +1902,18 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(long_row_x, nnz_scalar_t()); - Kokkos::parallel_for(labelLong, - Kokkos::RangePolicy( - 0, numLongRows * par_per_row), - gs); + Kokkos::parallel_for( + labelLong, + Kokkos::Experimental::require( + Kokkos::RangePolicy( + 0, numLongRows * par_per_row), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); From 0aa320fc0eaab595faf64812b48c578c234b1338 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 13:32:44 -0600 Subject: [PATCH 096/231] Cleanup and use overload pattern --- common/src/KokkosKernels_SimpleUtils.hpp | 34 +++++++++--- common/src/KokkosKernels_Utils.hpp | 54 ++++++++++++++++--- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 12 ++--- 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 0c9e82773a..64735874c6 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -142,22 +142,42 @@ inline void kk_exclusive_parallel_prefix_sum( kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -/*** 
- * \brief Function performs the inclusive parallel prefix sum. That is each - * entry holds the sum until itself including itself. \param num_elements: size - * of the array \param arr: the array for which the prefix sum will be - * performed. - */ +template +void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, + forward_array_type arr) {} + +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param my_exec_space: The execution space instance +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// template void kk_inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, - forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { + forward_array_type arr) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. 
+/// +template +void kk_inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); +} + template struct ReductionFunctor { view_t array_sum; diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index c8cc284b73..11b4100f31 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -456,10 +456,19 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, - forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { - kk_inclusive_parallel_prefix_sum( - num_elements, arr, my_exec_space); + forward_array_type arr) { + return kk_inclusive_parallel_prefix_sum( + my_exec_space, num_elements, arr); +} + +template +void inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template @@ -661,6 +670,7 @@ struct StridedCopy { template void create_reverse_map( + MyExecSpace my_exec_space, const typename reverse_array_type::value_type &num_forward_elements, // num_vertices const typename forward_array_type::value_type @@ -668,8 +678,7 @@ void create_reverse_map( const forward_array_type &forward_map, // vertex to colors reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj, - MyExecSpace my_exec_space = MyExecSpace()) { // colros to vertex adj + reverse_array_type &reverse_map_adj) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; @@ -751,6 +760,23 @@ void create_reverse_map( } } +template +void create_reverse_map( + const typename 
reverse_array_type::value_type + &num_forward_elements, // num_vertices + const typename forward_array_type::value_type + &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { + MyExecSpace my_exec_space; + return create_reverse_map(my_exec_space, num_forward_elements, + num_reverse_elements, forward_map, reverse_map_xadj, + reverse_map_adj); +} + template struct PermuteVector { @@ -1256,11 +1282,11 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(const size_t num_rows, +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, + const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, - size_type &max_row_size, - MyExecSpace my_exec_space = MyExecSpace()) { + size_type &max_row_size) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ViewReduceMaxRowSize", @@ -1269,6 +1295,18 @@ void kk_view_reduce_max_row_size(const size_t num_rows, max_row_size); } +// view has num_rows+1 elements. 
+template +void kk_view_reduce_max_row_size(const size_t num_rows, + const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, + size_type &max_row_size) { + MyExecSpace my_exec_space; + return kk_view_reduce_max_row_size(my_exec_space, num_rows, + rowmap_view_begins, rowmap_view_ends, + max_row_size); +} + template struct ReduceMaxRowFunctor { view_type rowmap_view; diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index abaa01effe..196a0de4b4 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -899,7 +899,7 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj, my_exec_space); + my_exec_space, num_rows, numColors, colors, color_xadj, color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE my_exec_space.fence(); @@ -988,7 +988,7 @@ class PointGaussSeidel { KokkosKernels::Impl::inclusive_parallel_prefix_sum< row_lno_persistent_work_view_t, MyExecSpace>( - num_rows + 1, permuted_xadj, my_exec_space); + my_exec_space, num_rows + 1, permuted_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE my_exec_space.fence(); @@ -1096,7 +1096,7 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. - // TODO: Need to account for number of streams here? 
+ // TODO: account for number of streams via handle.nstreams size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< @@ -1600,9 +1600,6 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_policy_t(0,nr), - // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( @@ -1682,9 +1679,6 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_policy_t(0,nr), - // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( From b6e7eb37531afa6f238ffd0695c381a2c48bd76f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 15:51:59 -0600 Subject: [PATCH 097/231] Cleanup and use overload pattern --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 196a0de4b4..a6c5e94184 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1019,8 +1019,8 @@ class PointGaussSeidel { // first calculate max row size. 
size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( - num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size, my_exec_space); + my_exec_space, num_rows, permuted_xadj.data(), + permuted_xadj.data() + 1, max_row_size); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; From 5340cebef9d12bcf586b40a4cfe4faa7b68eea38 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 15:54:25 -0600 Subject: [PATCH 098/231] Add GS handle overloads --- sparse/src/KokkosKernels_Handle.hpp | 17 ++++- .../src/KokkosSparse_gauss_seidel_handle.hpp | 71 +++++++++++++++---- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index dae3f12462..307ff7b91c 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -602,6 +602,7 @@ class KokkosKernelsHandle { return cgs; } void create_gs_handle( + HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { @@ -610,10 +611,20 @@ class KokkosKernelsHandle { // ---------------------------------------- // // Two-stage Gauss-Seidel if (gs_algorithm == KokkosSparse::GS_TWOSTAGE) - this->gsHandle = new TwoStageGaussSeidelHandleType(); - else this->gsHandle = - new PointGaussSeidelHandleType(gs_algorithm, coloring_algorithm); + new TwoStageGaussSeidelHandleType(handle_exec_space, num_streams); + else + this->gsHandle = new PointGaussSeidelHandleType( + handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); + } + + void create_gs_handle( + KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + return 
create_gs_handle(handle_exec_space, 1, gs_algorithm, + coloring_algorithm); } // ---------------------------------------- // // Two-stage Gauss-Seidel handle diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 6e57a23ee2..51ad48b580 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -85,6 +85,7 @@ class GaussSeidelHandle { protected: HandleExecSpace execution_space; + int num_streams; GSAlgorithm algorithm_type; @@ -104,6 +105,20 @@ class GaussSeidelHandle { */ GaussSeidelHandle(GSAlgorithm gs) : execution_space(HandleExecSpace()), + num_streams(1), + algorithm_type(gs), + color_xadj(), + color_adj(), + numColors(0), + called_symbolic(false), + called_numeric(false), + suggested_vector_size(0), + suggested_team_size(0) {} + + GaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs) + : execution_space(handle_exec_space), + num_streams(n_streams), algorithm_type(gs), color_xadj(), color_adj(), @@ -131,15 +146,6 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } - // setters - void set_execution_space(const HandleExecSpace exec_space) { - static bool is_exec_space_set = false; - if (!is_exec_space_set) { - this->execution_space = exec_space; - is_exec_space_set = true; - } - } - void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; @@ -257,10 +263,10 @@ class PointGaussSeidelHandle /** * \brief Default constructor. 
*/ - PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + PointGaussSeidelHandle(GSHandle gs_handle, KokkosGraph::ColoringAlgorithm coloring_algo_ = KokkosGraph::COLORING_DEFAULT) - : GSHandle(gs), + : GSHandle(gs_handle), permuted_xadj(), permuted_adj(), permuted_adj_vals(), @@ -276,9 +282,22 @@ class PointGaussSeidelHandle level_2_mem(0), long_row_threshold(0), coloring_algo(coloring_algo_) { - if (gs == GS_DEFAULT) this->choose_default_algorithm(); + if (gs_handle.get_algorithm_type() == GS_DEFAULT) + this->choose_default_algorithm(); } + PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(gs), coloring_algo_) {} + + PointGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(handle_exec_space, n_streams, gs), + coloring_algo_) {} + void set_block_size(nnz_lno_t bs) { this->block_size = bs; } nnz_lno_t get_block_size() const { return this->block_size; } @@ -626,8 +645,15 @@ class TwoStageGaussSeidelHandle ExecutionSpace, TemporaryMemorySpace, PersistentMemorySpace>; - TwoStageGaussSeidelHandle() - : GSHandle(GS_TWOSTAGE), + using HandleExecSpace = typename GSHandle::HandleExecSpace; + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param gsh The GaussSeidel handle. 
+ */ + TwoStageGaussSeidelHandle(GSHandle gs_handle) + : GSHandle(gs_handle), nrows(0), nrhs(1), direction(GS_SYMMETRIC), @@ -639,6 +665,23 @@ class TwoStageGaussSeidelHandle inner_omega = one; } + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + */ + TwoStageGaussSeidelHandle() + : TwoStageGaussSeidelHandle(GSHandle(GS_TWOSTAGE)) {} + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param handle_exec_space The execution space instance + * @param n_streams the number of streams + */ + TwoStageGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams) + : TwoStageGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_TWOSTAGE)) {} + // Sweep direction void setSweepDirection(GSDirection direction_) { this->direction = direction_; From c0fb396b75cacd00f5614efd8b0f6e409ba9215c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 16:13:55 -0600 Subject: [PATCH 099/231] Account for streams in memory allocs --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 37 ++++++++++++------- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 6 +-- .../src/KokkosSparse_gauss_seidel_handle.hpp | 2 + 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a0f6e39f4d..a30b2e777d 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -150,7 +150,8 @@ kk_is_a64fx_mem_space() { // Will throw if execution space doesn't support this. 
template inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */) { + size_t& /* total_mem */, + int /* n_streams */) { std::ostringstream oss; oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; @@ -160,26 +161,32 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, #ifdef KOKKOS_ENABLE_CUDA template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { + size_t& total_mem, + int n_streams = 1) { cudaMemGetInfo(&free_mem, &total_mem); + free_mem /= n_streams; + total_mem /= n_streams; } template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + size_t& total_mem, + int n_streams = 1) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } #endif #ifdef KOKKOS_ENABLE_HIP template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams = 1) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); + free_mem /= n_streams; + total_mem /= n_streams; } #endif @@ -188,7 +195,7 @@ inline void kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams = 1) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -220,20 +227,22 @@ inline void kk_get_free_total_memory( total_mem += memory_states.size; free_mem += memory_states.free; } + free_mem /= n_streams; + total_mem /= n_streams; } template <> inline void 
kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } #endif diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index a6c5e94184..41809203e2 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -826,6 +826,7 @@ class PointGaussSeidel { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); const MyExecSpace my_exec_space = gsHandle->get_execution_space(); + const int num_streams = gsHandle->get_num_streams(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -1096,12 +1097,11 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. 
- // TODO: account for number of streams via handle.nstreams size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< - typename pool_memory_space::memory_space>(free_byte, - total_byte); + typename pool_memory_space::memory_space>(free_byte, total_byte, + num_streams); size_t required_size = size_t(num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte) { num_big_rows = diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 51ad48b580..447d96d2a1 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -131,6 +131,8 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + int get_num_streams() const { return num_streams; } + HandleExecSpace get_execution_space() const { return this->execution_space; } GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } From 171ef7d9fc7665c6d9803d28e214f791d9039557 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Mon, 24 Jul 2023 13:58:07 -0500 Subject: [PATCH 100/231] Add TPL oneMKL GEMV support --- blas/src/KokkosBlas2_gemv.hpp | 9 ++ blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 41 +++++++ blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 103 ++++++++++++++++++ 3 files changed, 153 insertions(+) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index a8ebf02ca3..444f878424 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -168,6 +168,15 @@ void gemv(const execution_space& space, const char trans[], std::is_same::value); #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#ifdef KOKKOS_ENABLE_SYCL + // oneMKL supports both row-major and column-major of A + useFallback = + useFallback || !std::is_same_v; +#endif +#endif + if (useFallback) { const bool eti_spec_avail = KokkosBlas::Impl::gemv_eti_spec_avail::value; diff --git 
a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index f203be944f..1f5dde5b04 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -158,6 +158,47 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) #endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#ifdef KOKKOS_ENABLE_SYCL + +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ + template <> \ + struct gemv_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutLeft) + +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutRight) + +#endif + +#endif + } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 95e589bbf0..7aa854b962 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -786,4 +786,107 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +inline oneapi::mkl::transpose mode_kk_to_onemkl(char 
mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return oneapi::mkl::transpose::nontrans; + case 'T': return oneapi::mkl::transpose::trans; + case 'C': return oneapi::mkl::transpose::conjtrans; + default:; + } + throw std::invalid_argument( + "Invalid mode for oneMKL (should be one of N, T, C)"); +} + +template +struct kokkos_to_std_type_map { + using type = T; +}; + +// e.g., map Kokkos::complex to std::complex +template +struct kokkos_to_std_type_map { + using type = std::complex::mag_type>; +}; + +#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GEMV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using device_type = Kokkos::Device; \ + using mem_traits = Kokkos::MemoryTraits; \ + using AViewType = \ + Kokkos::View; \ + using XViewType = \ + Kokkos::View; \ + using YViewType = Kokkos::View; \ + \ + static void gemv(const execution_space& exec, const char kk_trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + bool row_major = std::is_same::value; \ + const std::int64_t M = A.extent(0); \ + const std::int64_t N = A.extent(1); \ + oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ + const std::int64_t LDA = row_major ? 
A.stride(0) : A.stride(1); \ + std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + \ + Kokkos::Profiling::pushRegion(label); \ + using mag_type = kokkos_to_std_type_map< \ + SCALAR, Kokkos::ArithTraits::is_complex>::type; \ + const mag_type* a = reinterpret_cast(A.data()); \ + const mag_type* x = reinterpret_cast(X.data()); \ + mag_type* y = reinterpret_cast(Y.data()); \ + if (row_major) { \ + oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, \ + alpha, a, LDA, x, 1, beta, y, 1); \ + } else { \ + oneapi::mkl::blas::column_major::gemv( \ + exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +} // namespace Impl +} // namespace KokkosBlas +#endif + #endif From c99b373c48b6889678c5b85347e0b97344deec3d Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jul 2023 11:50:53 -0600 Subject: [PATCH 101/231] remove Intel 2017 code (no longer supported) --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 109 ------------------ .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 97 ---------------- 2 
files changed, 206 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 687eb81de0..78ce736173 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -198,115 +198,6 @@ inline void spm_mv_block_impl_mkl( #endif -#if (__INTEL_MKL__ == 2017) - -inline void spmv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - mkl_dbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_cbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* 
y) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); - mkl_zbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, - float* y, MKL_INT ldy) { - mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, - Arowptrs, Arowptrs + 1, x, &beta, y); -} - -inline void spm_mv_block_impl_mkl( - char mode, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, - const double* x, MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { - mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, - Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); -} - -inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, MKL_INT colx, - MKL_INT ldx, Kokkos::complex* y, - MKL_INT ldy) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_cbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); -} - -inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex 
beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, - Kokkos::complex* y, MKL_INT ldy) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); - mkl_zbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); -} - -#endif - #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 9b37361e65..fea98c591a 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -637,103 +637,6 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, }; #endif -#if (__INTEL_MKL__ == 2017) -// MKL 2017: use old interface: mkl_?csrmv -inline char mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return 'N'; - case 'T': return 'T'; - case 'H': return 'C'; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} - -inline void spmv_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const float* Avalues, const float* x, float* y) { - mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_mkl(char mode, double alpha, double beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const double* Avalues, const double* x, double* y) { - mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, 
Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_ccsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, - Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); - mkl_zcsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, - Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - CrsMatrix, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename 
YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - static void spmv(const Controls&, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ - A.graph.row_map.data(), A.graph.entries.data(), \ - A.values.data(), x.data(), y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ - }; -#endif - #ifdef KOKKOS_ENABLE_SERIAL KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial, From 18bcdaf3fba87b5e483bd6919cdd79be8eb054bf Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jul 2023 13:59:42 -0600 Subject: [PATCH 102/231] remove triplicate sanity checks in BsrMatrix --- sparse/src/KokkosSparse_BsrMatrix.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index b36143c14b..8b789f66f3 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -522,16 +522,6 @@ class BsrMatrix { "BsrMatrix:: annz should be a multiple of the number of entries in a " "block"); } - if (annz % (blockDim_ * blockDim_)) { - throw std::runtime_error( - "BsrMatrix:: annz should be a multiple of the number of entries in a " - "block"); - } - if (annz % (blockDim_ * blockDim_)) { - throw std::runtime_error( - "BsrMatrix:: annz should be a multiple of the number of entries in a " - "block"); - } using Coord = std::pair; // row, col using CoordComp = std::function Date: Fri, 28 Jul 2023 17:39:54 -0600 Subject: [PATCH 103/231] SPARSE: making the BSR test compatible with 4.0 and 4.1 --- common/impl/KokkosKernels_ViewUtils.hpp | 6 ++++++ 
.../impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp index 2ae8fb609d..ac4abb6457 100644 --- a/common/impl/KokkosKernels_ViewUtils.hpp +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -19,6 +19,11 @@ #include "Kokkos_Core.hpp" namespace KokkosKernels::Impl { +// lbv - 07/26/2023: +// MemoryTraits::impl_value was added +// in Kokkos 4.1.00 so we should guard +// the content of this header until v4.3.0 +#if KOKKOS_VERSION >= 40100 || defined(DOXY) /*! \brief Yields a type that is View with Kokkos::Unmanaged added to the memory * traits @@ -54,6 +59,7 @@ auto make_unmanaged(const View &v) { return typename with_unmanaged::type(v); } +#endif // KOKKOS_VERSION >= 40100 } // namespace KokkosKernels::Impl #endif diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp index 9c5858a307..3ac934f5d8 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -120,6 +120,11 @@ void apply_v42(const Alpha &alpha, const AMatrix &a, const XVector &x, Kokkos::RangePolicy policy(0, y.size()); if constexpr (YVector::rank == 1) { +// lbv - 07/26/2023: +// with_unmanaged_t<...> required Kokkos 4.1.0, +// the content of this header will be guarded +// until v4.3.0 +#if KOKKOS_VERSION >= 40100 || defined(DOXY) // Implementation expects a 2D view, so create an unmanaged 2D view // with extent 1 in the second dimension using Y2D = KokkosKernels::Impl::with_unmanaged_t>; +#else + // Implementation expects a 2D view, so create an unmanaged 2D view + // with extent 1 in the second dimension + using Y2D = Kokkos::View< + typename YVector::value_type * [1], typename YVector::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + using X2D = Kokkos::View< + typename XVector::value_type * [1], typename 
XVector::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits>; +#endif // KOKKOS_VERSION >= 40100 || defined(DOXY) const Y2D yu(y.data(), y.extent(0), 1); const X2D xu(x.data(), x.extent(0), 1); BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); From cfa37e5dddc424bf3afb767b45d76e288cc51b50 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 25 Jul 2023 11:34:04 -0600 Subject: [PATCH 104/231] ODE: adding newton solver The solver was implemented along with some helper utilities: solver status and solver params. Four tests are added to check the interface, the correctness of single solves and the correctness in parallel solves that mimic its use in FE/FD code. --- CMakeLists.txt | 2 +- blas/impl/KokkosBlas_Newton_impl.hpp | 212 ----------- blas/unit_test/Test_Blas.hpp | 3 - ode/impl/KokkosODE_Newton_impl.hpp | 91 +++++ ode/src/KokkosODE_Newton.hpp | 45 +++ ode/src/KokkosODE_RungeKutta.hpp | 2 +- ode/src/KokkosODE_Types.hpp | 15 + ode/unit_test/Test_ODE.hpp | 4 + ode/unit_test/Test_ODE_Newton.hpp | 542 +++++++++++++++++++++++++++ 9 files changed, 699 insertions(+), 217 deletions(-) delete mode 100644 blas/impl/KokkosBlas_Newton_impl.hpp create mode 100644 ode/impl/KokkosODE_Newton_impl.hpp create mode 100644 ode/src/KokkosODE_Newton.hpp create mode 100644 ode/unit_test/Test_ODE_Newton.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c144a8c107..77156d8ec7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") + MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") ENDIF() INCLUDE(cmake/kokkos_backends.cmake) diff --git a/blas/impl/KokkosBlas_Newton_impl.hpp b/blas/impl/KokkosBlas_Newton_impl.hpp deleted file mode 100644 index db4b8a3a43..0000000000 --- a/blas/impl/KokkosBlas_Newton_impl.hpp +++ /dev/null @@ -1,212 +0,0 @@ 
-//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef __KOKKOSBATCHED_ODE_NEWTON_HPP__ -#define __KOKKOSBATCHED_ODE_NEWTON_HPP__ - -#include "Kokkos_Core.hpp" -#include "KokkosBatched_LU_Decl.hpp" -#include "KokkosBatched_LU_Serial_Impl.hpp" -#include "KokkosBatched_Gesv.hpp" -#include "KokkosBlas1_nrm2.hpp" -#include "KokkosBlas1_scal.hpp" -#include "KokkosBlas1_axpby.hpp" - -namespace KokkosBlas { -namespace Impl { - -enum class NewtonSolverStatus { Converged = 0, LinearSolveFailure, MaxIters }; - -std::ostream& operator<<(std::ostream& os, NewtonSolverStatus& status) { - switch (status) { - case NewtonSolverStatus::Converged: os << "Newton Solver Converged!"; break; - case NewtonSolverStatus::LinearSolveFailure: - os << "Newton: Linear Solver Failure"; - break; - case NewtonSolverStatus::MaxIters: - os << "Newton reached maximum iterations without convergence."; - break; - } - return os; -} - -/// \brief NewtonHandle -/// -/// This handle is used to pass information between the Newton Solver and -/// the calling code. -/// -/// \tparam: NormViewType: Type of view used to store the residual convergence -/// history - -template -struct NewtonHandle { - using norm_type = typename NormViewType::non_const_value_type; - - NormViewType lastResidual; // Residual of last successful iteration - typename NormViewType::HostMirror lastResidualHost; - - // NormViewType residual_norms; - // TODO: Making these public for now. Should make private and access - // via setters and getters? 
- int maxIters; // Maximum number of Newton steps - norm_type relativeTol; // Relative convergence tolerance - bool debug_mode; // Returns extra verbose output if true. - - NewtonHandle(int _maxIters = 25, double _relativeTol = 1.0e-6, - bool _debug = false) - : lastResidual("ending Residual norm", 1), - lastResidualHost("end res norm host", 1), - maxIters(_maxIters), - relativeTol(_relativeTol), - debug_mode(_debug) {} - - KOKKOS_FUNCTION - void set_residual(const norm_type val) const { lastResidual(0) = val; } - - KOKKOS_FUNCTION - norm_type get_residual() const { return lastResidual(0); } - - norm_type get_residual_host() const { - Kokkos::deep_copy(lastResidualHost, lastResidual); - return lastResidualHost(0); - } - -}; // NewtonHandle - -/// \brief Newton Functor: -/// Solves the nonlinear system F(x) = 0 -/// where F is a map from R^n to R^n. -/// \tparam System: Struct that allows the evaluation -/// of the residual and jacobian using the -/// residual() and jacobian() methods. -/// \tparam Matrix: rank-2 view-type -/// \tparam XVector: rank-1 view-type -/// \tparam YVector: rank-1 view-type -/// \param -/// \param X [in]: Input vector X, a rank 1 view -/// \param Y [in/out]: Output vector Y, a rank 1 view -/// -/// No nested parallel_for is used inside of the function. 
-/// -template -struct NewtonFunctor { - using execution_space = typename YVector::execution_space; - using yvalue_type = typename YVector::non_const_value_type; - using norm_type = typename NewtonHandleType::norm_type; - - System sys; - XVector x; - YVector rhs; - NewtonHandleType handle; - - Matrix J, tmp; - XVector update; - - NewtonFunctor(System _sys, XVector _x, YVector _rhs, - NewtonHandleType& _handle) - : sys(_sys), x(_x), rhs(_rhs), handle(_handle) { - J = Matrix("Jacobian", x.extent(0), x.extent(0)); - tmp = Matrix("Jacobian", x.extent(0), x.extent(0) + 4); - update = XVector("update", x.extent(0)); - } - - KOKKOS_INLINE_FUNCTION - NewtonSolverStatus solve() const { - norm_type norm = Kokkos::ArithTraits::zero(); - yvalue_type alpha = Kokkos::ArithTraits::one(); - handle.set_residual(-1); // init to dummy value - - // Iterate until maxIts or the tolerance is reached - for (int it = 0; it < handle.maxIters; ++it) { - // compute initial rhs - sys.residual(x, rhs); - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("NewtonFunctor: r="); - for (int k = 0; k < rhs.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", rhs(k)); - } - } - - // Solve the following linearized - // problem at each step: J*update=-rhs - // with J=du/dx, rhs=f(u_n+update)-f(u_n) - norm = KokkosBlas::serial_nrm2(rhs); - handle.set_residual(norm); - - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Iteration: %d Current res norm is: %e \n Current " - "soln is:\n", - it, (double)handle.get_residual()); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - - if (norm < handle.relativeTol) { - // Problem solved, exit the functor - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Newton solver converged! 
Ending norm is: %e \n " - "Solution x is: " - "\n", - norm); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - return NewtonSolverStatus::Converged; - } - - // compute LHS - sys.jacobian(x, J); - - // solve linear problem - int linSolverStat = KokkosBatched::SerialGesv< - KokkosBatched::Gesv::StaticPivoting>::invoke(J, update, rhs, tmp); - KokkosBlas::SerialScale::invoke(-1, update); - - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Print linear solve solution: \n"); - for (int k = 0; k < update.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", update(k)); - } - } - if (linSolverStat == 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Linear solve gesv returned failure! \n"); - return NewtonSolverStatus::LinearSolveFailure; - } - - // update solution // x = x + alpha*update - KokkosBlas::serial_axpy(alpha, update, x); - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Print updated solution: \n"); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - } - return NewtonSolverStatus::MaxIters; - } // End solve functor. -}; - -} // namespace Impl -} // namespace KokkosBlas -#endif // __KOKKOSBATCHED_ODE_NEWTON_HPP__ diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index 077c7eb870..1f4f130e8b 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -76,9 +76,6 @@ #include "Test_Blas3_trmm.hpp" #include "Test_Blas3_trsm.hpp" -// Stuff that should move later on -#include "Test_Blas_Newton.hpp" - // TPLs #include "Test_Blas_rocblas.hpp" diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp new file mode 100644 index 0000000000..f0cb90810e --- /dev/null +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_NEWTON_IMPL_HPP +#define KOKKOSODE_NEWTON_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosBlas1_scal.hpp" +#include "KokkosBlas1_axpby.hpp" + +#include "KokkosODE_Types.hpp" + +namespace KokkosODE { +namespace Impl { + +template +KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( + system_type& sys, const KokkosODE::Experimental::Newton_params& params, + mat_type& J, mat_type& tmp, vec_type& y0, vec_type& rhs, vec_type& update) { + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using value_type = typename vec_type::non_const_value_type; + + // Define the type returned by nrm2 to store + // the norm of the residual. + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename vec_type::non_const_value_type>::mag_type; + norm_type norm = Kokkos::ArithTraits::zero(); + + // LBV - 07/24/2023: for now assume that we take + // a full Newton step. Eventually this value can + // be computed using a line search algorithm to + // improve convergence for difficult problems.
+ const value_type alpha = Kokkos::ArithTraits::one(); + + // Iterate until params.max_iters or one of the tolerances is reached + for (int it = 0; it < params.max_iters; ++it) { + // compute the residual at the current iterate y0 + sys.residual(y0, rhs); + + // Solve the following linearized + // problem at each iteration: J*update=-rhs + // with J=du/dx, rhs=f(u_n+update)-f(u_n) + norm = KokkosBlas::serial_nrm2(rhs); + + if ((norm < params.rel_tol) || + (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + return newton_solver_status::NLS_SUCCESS; + } + + // compute LHS + sys.jacobian(y0, J); + + // solve linear problem + int linSolverStat = + KokkosBatched::SerialGesv::invoke( + J, update, rhs, tmp); + KokkosBlas::SerialScale::invoke(-1, update); + + if (linSolverStat == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Linear solve gesv returned failure! \n"); + return newton_solver_status::LIN_SOLVE_FAIL; + } + + // update solution: y0 = y0 + alpha*update + KokkosBlas::serial_axpy(alpha, update, y0); + } + return newton_solver_status::MAX_ITER; +} + +} // namespace Impl +} // namespace KokkosODE + +#endif // KOKKOSODE_NEWTON_IMPL_HPP diff --git a/ode/src/KokkosODE_Newton.hpp b/ode/src/KokkosODE_Newton.hpp new file mode 100644 index 0000000000..94c96e2eea --- /dev/null +++ b/ode/src/KokkosODE_Newton.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_NEWTON_HPP +#define KOKKOSODE_NEWTON_HPP + +/// \author Luc Berger-Vergiat (lberge@sandia.gov) +/// \file KokkosODE_Newton.hpp + +#include "Kokkos_Core.hpp" + +#include "KokkosODE_Types.hpp" +#include "KokkosODE_Newton_impl.hpp" + +namespace KokkosODE { +namespace Experimental { + +/// \brief Newton solver for non-linear system of equations +struct Newton { + template + KOKKOS_FUNCTION static newton_solver_status Solve( + const system_type& sys, const Newton_params& params, const mat_type& J, + const mat_type& tmp, const vec_type& y0, const vec_type& rhs, + const vec_type& update) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update); + } +}; + +} // namespace Experimental +} // namespace KokkosODE + +#endif // KOKKOSODE_NEWTON_HPP diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp index c41d79c1ef..b4711de81c 100644 --- a/ode/src/KokkosODE_RungeKutta.hpp +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOSODE_RUNGEKUTTA_HPP #define KOKKOSODE_RUNGEKUTTA_HPP -/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \author Luc Berger-Vergiat (lberge@sandia.gov) /// \file KokkosODE_RungeKutta.hpp #include "Kokkos_Core.hpp" diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 136ff75536..068c4b17ed 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -51,6 +51,21 @@ struct ODE_params { min_step_size(min_step_size_) {} }; +enum newton_solver_status { NLS_SUCCESS = 0, MAX_ITER = 1, LIN_SOLVE_FAIL = 2 }; + +struct Newton_params { + int max_iters; + double abs_tol, rel_tol; + + // Constructor that sets the maximum number of Newton iterations
+ // (max_iters_) together with the absolute (abs_tol_) and + // relative (rel_tol_) tolerances stored in this struct. + KOKKOS_FUNCTION + Newton_params(const int max_iters_, const double abs_tol_, + const double rel_tol_) + : max_iters(max_iters_), abs_tol(abs_tol_), rel_tol(rel_tol_) {} +}; + } // namespace Experimental } // namespace KokkosODE #endif // KOKKOSODE_TYPES_HPP diff --git a/ode/unit_test/Test_ODE.hpp b/ode/unit_test/Test_ODE.hpp index dd929c48fc..5d4861879b 100644 --- a/ode/unit_test/Test_ODE.hpp +++ b/ode/unit_test/Test_ODE.hpp @@ -16,7 +16,11 @@ #ifndef TEST_ODE_HPP #define TEST_ODE_HPP +// Explicit integrators #include "Test_ODE_RK.hpp" #include "Test_ODE_RK_chem.hpp" +// Implicit integrators +#include "Test_ODE_Newton.hpp" + #endif // TEST_ODE_HPP diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp new file mode 100644 index 0000000000..da29d895fc --- /dev/null +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -0,0 +1,542 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_Newton.hpp" + +namespace Test { + +template +struct NewtonSolve_wrapper { + using newton_params = KokkosODE::Experimental::Newton_params; + + system_type my_nls; + newton_params params; + + vec_type x, rhs, update; + mat_type J, tmp; + status_view status; + + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, + const vec_type& x_, const vec_type& rhs_, + const vec_type& update_, const mat_type& J_, + const mat_type& tmp_, const status_view& status_) + : my_nls(my_nls_), + params(params_), + x(x_), + rhs(rhs_), + update(update_), + J(J_), + tmp(tmp_), + status(status_) {} + + KOKKOS_FUNCTION + void operator()(const int idx) const { + // Take subviews to create the local problem + auto local_x = Kokkos::subview( + x, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1))); + auto local_rhs = Kokkos::subview( + rhs, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1))); + auto local_update = Kokkos::subview( + update, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1))); + auto local_J = Kokkos::subview( + J, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1)), + Kokkos::ALL()); + auto local_tmp = Kokkos::subview( + tmp, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1)), + Kokkos::ALL()); + + // Run Newton nonlinear solver + status(idx) = KokkosODE::Experimental::Newton::Solve( + my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update); + } +}; + +template +void run_newton_test(const system_type& mySys, + KokkosODE::Experimental::Newton_params& params, + const scalar_type* const initial_val, + const scalar_type* const solution) { + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + + Kokkos::View status("Newton status", + 1); + + vec_type x("solution
vector", mySys.neqs), + rhs("right hand side vector", mySys.neqs); + auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + vec_type update("update", mySys.neqs); + mat_type J("jacobian", mySys.neqs, mySys.neqs), + tmp("temp mem", mySys.neqs, mySys.neqs + 4); + + // Copy the initial guess to the device through the host mirror + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + x_h(eqIdx) = initial_val[eqIdx]; + } + Kokkos::deep_copy(x, x_h); + + Kokkos::RangePolicy my_policy(0, 1); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, + status); + + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto status_h = Kokkos::create_mirror_view(status); + Kokkos::deep_copy(status_h, status); + EXPECT_TRUE(status_h(0) == newton_solver_status::NLS_SUCCESS); + + Kokkos::deep_copy(x_h, x); + Kokkos::deep_copy(r_h, rhs); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Non-linear problem solution and residual:" << std::endl; + std::cout << " [("; + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + std::cout << " " << x_h(eqIdx); + } + std::cout << " ), " << KokkosBlas::serial_nrm2(r_h) << ", ("; + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + std::cout << " " + << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / + Kokkos::abs(solution[eqIdx]); + } + std::cout << " )]" << std::endl; +#else + (void)solution; +#endif +} + +// Quadratic equation +// x^2 - x - 2 = 0 +// Solution: x = 2 or x = -1 +// Derivative 2*x - 1 +template +struct QuadraticEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + QuadraticEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) - y(0) - 2; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0) - 1; + } +}; + +// Trigonometric equation +// f(x) = cos(x) - x = 0 +// Solution: 0.739085 +// f'(x) = -sin(x) - 1 +template +struct TrigonometricEquation { +
using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + TrigonometricEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = Kokkos::cos(y(0)) - y(0); + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = -Kokkos::sin(y(0)) - 1; + } +}; + +// Logarithmic equation +// f(x) = 7x - log(7x) - 1 = 0 +// Solution: 1/7 = 0.14285714285 +// f'(x) = 7 - (1 / x) +template +struct LogarithmicEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + LogarithmicEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = 7 * y(0) - Kokkos::log(7 * y(0)) - 1; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 7 - 1 / y(0); + } +}; + +template +void test_newton_status() { + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + Kokkos::View status( + "newton solver status", 1); + auto status_h = Kokkos::create_mirror_view(status); + + // Create the non-linear system and initialize data + QuadraticEquation my_system{}; + + scalar_type initial_value[3] = {1.0, -0.5, 0.5}; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + scalar_type solution[3] = {2.0, -1.0, 0.0}; +#endif + newton_solver_status newton_status[3] = { + newton_solver_status::NLS_SUCCESS, newton_solver_status::MAX_ITER, + newton_solver_status::LIN_SOLVE_FAIL}; + vec_type x("solution vector", 1), rhs("right hand side vector", 1); + 
auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + vec_type update("update", 1); + mat_type J("jacobian", 1, 1), tmp("temp mem", 1, 5); + + for (int idx = 0; idx < 3; ++idx) { + params.max_iters = (idx == 1) ? 2 : 50; + Kokkos::deep_copy(x, initial_value[idx]); + + Kokkos::RangePolicy my_policy(0, 1); + NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, + status); + Kokkos::parallel_for(my_policy, solve_wrapper); + + Kokkos::deep_copy(status_h, status); + EXPECT_TRUE(status_h(0) == newton_status[idx]); + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + Kokkos::deep_copy(x_h, x); + Kokkos::deep_copy(r_h, rhs); + printf("Non-linear problem solution and residual with initial value %f:\n", + initial_value[idx]); + printf(" [%f, %g, %g]\n", x_h(0), r_h(0), + Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); +#endif + } +} + +template +void test_simple_problems() { + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + + { + // Test the Newton solver on a quadratic equation + // with two different initial guesses that lead to + // the two solutions of the equation.
+#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Quadratic Equation problem" << std::endl; +#endif + using system_type = QuadraticEquation; + system_type mySys{}; + scalar_type initial_value[2] = {1.0, -0.5}, solution[2] = {2.0, -1.0}; + for (int idx = 0; idx < 2; ++idx) { + run_newton_test( + mySys, params, &(initial_value[idx]), &(solution[idx])); + } +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Quadratic Equation problem" << std::endl; +#endif + } + + { + // Test the Newton solver on a trigonometric equation +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Trigonometric Equation problem" << std::endl; +#endif + using system_type = TrigonometricEquation; + system_type mySys{}; + scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; + run_newton_test( + mySys, params, initial_value, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Trigonometric Equation problem" << std::endl; +#endif + } + + { + // Test the Newton solver on a logarithmic equation +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Logarithmic Equation problem" << std::endl; +#endif + using system_type = LogarithmicEquation; + system_type mySys{}; + scalar_type initial_value[1] = {static_cast(0.5)}, + solution[1] = {static_cast(1.0) / + static_cast(7.0)}; + run_newton_test( + mySys, params, initial_value, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Logarithmic Equation problem" << std::endl; +#endif + } +} + +/////////////////////////////////////// +// Now solving systems of equations // +// To make things more realistic and // +// interesting.
// +/////////////////////////////////////// + +// Intersections of two circles +// Equations: f0 = (x-0)**2 + (y-0)**2 - 4.00 = 0 +// f1 = (x-3)**2 + (y-0)**2 - 2.25 = 0 +// +// Jacobian: J00 = 2*x J01 = 2*y +// J10 = 2*(x-3) J11 = 2*y +// +// Solution: x = 10.75/6 y = +/- sqrt(2.25 - (7.25/6)**2) +// ~ 1.7916666 ~ +/- 0.8887803753 +template +struct CirclesIntersections { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 2; + + CirclesIntersections() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) + y(1) * y(1) - 4; + f(1) = (y(0) - 3) * (y(0) - 3) + y(1) * y(1) - 2.25; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = 2 * (y(0) - 3); + jac(1, 1) = 2 * y(1); + } +}; + +// Intersections of a circle and an hyperbola +// Equations: f0 = x**2 + y**2 - 4.00 = 0 +// f1 = x*y - 1 = 0 --> also y = 1 / x +// +// Jacobian: J00 = 2*x J01 = 2*y +// J10 = y J11 = x +// +// Solution: x = +/- sqrt( (4 +/- sqrt(12)) / 2); y = 1 / x +// x0~ 1.9318516525 y0~ 0.5176380902 +// x1~ 0.5176380902 y1~ 1.9318516525 +// x2~ -0.5176380902 y2~ -1.9318516525 +// x3~ -1.9318516525 y3~ -0.5176380902 +template +struct CircleHyperbolaIntersection { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 2; + + CircleHyperbolaIntersection() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) + y(1) * y(1) - 4; + f(1) = y(0) * y(1) - 1; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = y(1); + jac(1, 1) = y(0); + } +}; + +template +void test_simple_systems() { + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; +
} else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + + { + // First problem: intersection of two circles +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Circles Intersetcion problem" << std::endl; +#endif + using system_type = CirclesIntersections; + system_type mySys{}; + scalar_type initial_values[2] = {1.5, 1.5}; + scalar_type solution[2] = {10.75 / 6, 0.8887803753}; + run_newton_test( + mySys, params, initial_values, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Circles Intersetcion problem" << std::endl; +#endif + } + + { + // Second problem: circle / hyperbola intersection, reference (x1, y1) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" + << std::endl; +#endif + using system_type = + CircleHyperbolaIntersection; + system_type mySys{}; + + scalar_type init_vals[2] = {0.0, 1.0}; + scalar_type solutions[2] = { + Kokkos::ArithTraits::one() / + Kokkos::sqrt(static_cast( + (4 + Kokkos::sqrt(static_cast(12.0))) / 2)), + Kokkos::sqrt(static_cast( + (4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; + run_newton_test( + mySys, params, init_vals, solutions); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Circle/Hyperbola Intersetcion problem" << std::endl; +#endif + } +} + +//////////////////////////////////////////// +// Finally, solving systems of equations // +// within a parallel_for loop as it would // +// happen within a FE/FD code.
// +//////////////////////////////////////////// + +template +void test_newton_on_device() { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using newton_params = KokkosODE::Experimental::Newton_params; + using system_type = CircleHyperbolaIntersection; + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + + constexpr int num_systems = 1000; + const newton_params params(50, abs_tol, rel_tol); + + system_type mySys{}; + + vec_type x("solution vector", mySys.neqs * num_systems); + vec_type rhs("right hand side vector", mySys.neqs * num_systems); + vec_type update("update", mySys.neqs * num_systems); + mat_type J("jacobian", mySys.neqs * num_systems, mySys.neqs); + mat_type tmp("temp mem", mySys.neqs * num_systems, mySys.neqs + 4); + + Kokkos::View status("solver status", + num_systems); + + auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + // Initial values + scalar_type initial_val[2] = {0.0, 1.0}; + for (int sysIdx = 0; sysIdx < num_systems; ++sysIdx) { + x_h(2 * sysIdx) = initial_val[0]; + x_h(2 * sysIdx + 1) = initial_val[1]; + } + Kokkos::deep_copy(x, x_h); + + Kokkos::RangePolicy my_policy(0, num_systems); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, + status); + + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + + auto status_h = Kokkos::create_mirror_view(status); + Kokkos::deep_copy(status_h, status); + Kokkos::deep_copy(x_h, x); + for (int sysIdx = 0; sysIdx < num_systems; ++sysIdx) { + EXPECT_TRUE(status_h(sysIdx) == newton_solver_status::NLS_SUCCESS) + << "System " << sysIdx << " did not report a successful convergence!"; + } +} + +} // namespace Test + +// No ETI is 
performed for these device routines +// Just pick scalar types at will... +TEST_F(TestCategory, Newton_status_float) { + ::Test::test_newton_status(); +} +TEST_F(TestCategory, Newton_status_double) { + ::Test::test_newton_status(); +} + +TEST_F(TestCategory, Newton_simple_float) { + ::Test::test_simple_problems(); +} +TEST_F(TestCategory, Newton_simple_double) { + ::Test::test_simple_problems(); +} + +TEST_F(TestCategory, Newton_system_float) { + ::Test::test_simple_systems(); +} +TEST_F(TestCategory, Newton_system_double) { + ::Test::test_simple_systems(); +} + +TEST_F(TestCategory, Newton_parallel_float) { + ::Test::test_newton_on_device(); +} +TEST_F(TestCategory, Newton_parallel_double) { + ::Test::test_newton_on_device(); +} From 1da7a774e0555568ba46465ea1b439e97e1e4d33 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jul 2023 13:50:24 -0600 Subject: [PATCH 105/231] remove duplicate BSR SpMV tests --- sparse/src/KokkosKernels_Controls.hpp | 12 +- sparse/unit_test/Test_Sparse_spmv.hpp | 601 ---------------------- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 112 +++- 3 files changed, 95 insertions(+), 630 deletions(-) diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index 0bb8f79ff0..1ee8cd108e 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -21,6 +21,7 @@ /// \author Luc Berger-Vergiat (lberge@sandia.gov) #include +#include #include "KokkosKernels_config.h" #include "KokkosKernels_tpl_handles_decl.hpp" @@ -43,8 +44,13 @@ namespace Experimental { // Declaration of Controls class class Controls { public: + using key_type = std::string; + using mapped_type = std::string; + using value_type = std::pair; + // Constructor Controls() = default; + Controls(std::initializer_list init) : kernel_parameters(init) {} // set a new parameter void setParameter(const std::string& name, const std::string& value) { @@ -60,8 +66,8 @@ class Controls { /// /// \param name the name of 
the parameter to retrieve /// \param orUnset (default \c "" ) the value to return if \c name is not set - std::string getParameter(const std::string& name, - const std::string& orUnset = "") const { + key_type getParameter(const std::string& name, + const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { return orUnset; @@ -123,7 +129,7 @@ class Controls { private: // storage for kernel parameters - std::unordered_map kernel_parameters; + std::unordered_map kernel_parameters; }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9da0733581..16a6b56a48 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -1112,489 +1112,6 @@ void test_github_issue_101() { } } -template -CrsMat make_block_matrix(typename CrsMat::ordinal_type &numRows, - typename CrsMat::ordinal_type &numCols, - typename CrsMat::ordinal_type &blockSize) { - using lno_t = typename CrsMat::ordinal_type; - using scalar_t = typename CrsMat::value_type; - - using Kokkos::HostSpace; - using Kokkos::MemoryUnmanaged; - using Kokkos::View; - - Kokkos::Random_XorShift64 rand(13718); - - // fill outputs with random values - // Kokkos::Random_XorShift64_Pool rand_pool(13718); - // Kokkos::fill_random(hi_x, rand_pool, randomUpperBound(10)); - - std::vector values; - std::vector rowmap; - std::vector entries; - - // each row of blocks - for (lno_t bi = 0; bi < numRows; bi += blockSize) { - // target number of blocks in the row - lno_t rowBlockCount = 3; - { - // cap the number of blocks in the row - lno_t maxBlocksInRow = numCols / blockSize; - rowBlockCount = std::min(maxBlocksInRow, rowBlockCount); - } - - // where the blocks in this row of blocks start - // add that many blocks at random positions in the row - std::vector bjs; - for (int _ = 0; _ < rowBlockCount; ++_) { - bjs.push_back(rand.rand(numCols / blockSize) * 
blockSize); - } - - // remove duplicates - { - std::sort(bjs.begin(), bjs.end()); - auto it = std::unique(bjs.begin(), bjs.end()); - bjs.resize(it - bjs.begin()); - } - - for (lno_t i = bi; i < bi + blockSize; ++i) { - rowmap.push_back(entries.size()); // where this row starts - - // for each block - for (size_t block = 0; block < bjs.size(); ++block) { - lno_t bj = bjs[block]; - for (lno_t j = bj; j < bj + blockSize; ++j) { - entries.push_back(j); - values.push_back(rand.rand(10)); - // values.push_back(1); - } - } - } - } - - while (rowmap.size() < numRows + 1) { - rowmap.push_back(entries.size()); - } - - return CrsMat("", numRows, numCols, values.size(), values.data(), - rowmap.data(), entries.data()); -} - -struct Coordinate { - int i; - int j; - Coordinate(int _i, int _j) : i(_i), j(_j) {} - // sort by i then j - static bool by_ij(const Coordinate &a, const Coordinate &b) { - if (a.i < b.i) { - return true; - } else if (a.i > b.i) { - return false; - } else { - return a.j < b.j; - } - } -}; -struct Entry { - Coordinate c; - double e; - Entry(int i, int j, double _e) : c(i, j), e(_e) {} - static bool by_ij(const Entry &a, const Entry &b) { - return Coordinate::by_ij(a.c, b.c); - } -}; - -// expand a pattern into a blocked CrsMatrix -template ::value, bool> = true> -Matrix expand_matrix(std::vector pattern, const int m, const int k, - const int blockSize, const int seed = 0) { - typedef typename Matrix::value_type Scalar; - typedef typename Matrix::ordinal_type Ordinal; - typedef typename Matrix::non_const_size_type Offset; - typedef Kokkos::View> - UnmanagedRowmap; - typedef Kokkos::View> - UnmanagedEntries; - typedef Kokkos::View> - UnmanagedValues; - - srand(seed); - - auto gen_rand = []() -> double { return rand() % 10; }; - - // check rows and columns - for (const Coordinate &c : pattern) { - if (c.i >= m) { - KokkosKernels::Impl::throw_runtime_exception("i exceeded matrix rows"); - } - if (c.j >= k) { - KokkosKernels::Impl::throw_runtime_exception("j 
exceeded matrix cols"); - } - } - - // order the blocks - std::sort(pattern.begin(), pattern.end(), Coordinate::by_ij); - - // create coo entries for each block - std::vector entries; - for (const Coordinate &c : pattern) { - for (int i = 0; i < blockSize; ++i) { - for (int j = 0; j < blockSize; ++j) { - entries.push_back( - Entry(c.i * blockSize + i, c.j * blockSize + j, gen_rand())); - } - } - } - - std::sort(entries.begin(), entries.end(), Entry::by_ij); - - std::vector rowMap; - std::vector colInd; - std::vector val; - - for (Entry &e : entries) { - while (rowMap.size() < size_t(e.c.i + 1)) { // catch empty rows - rowMap.push_back(colInd.size()); - } - colInd.push_back(e.c.j); - val.push_back(e.e); - } - // possibly empty rows at end of matrix - while (rowMap.size() <= size_t(m * blockSize)) { - rowMap.push_back(colInd.size()); - } - - typename Matrix::row_map_type::non_const_type sparseRowMap("", rowMap.size()); - Kokkos::deep_copy(sparseRowMap, - UnmanagedRowmap(rowMap.data(), rowMap.size())); - typename Matrix::index_type::non_const_type sparseCols("", colInd.size()); - Kokkos::deep_copy(sparseCols, UnmanagedEntries(colInd.data(), colInd.size())); - typename Matrix::values_type::non_const_type sparseVals("", val.size()); - Kokkos::deep_copy(sparseVals, UnmanagedValues(val.data(), val.size())); - - Matrix mat("crs", m * blockSize, k * blockSize, sparseVals.size(), sparseVals, - sparseRowMap, sparseCols); - return mat; -} - -template < - typename Matrix, - std::enable_if_t::value, - bool> = true> -Matrix expand_matrix(std::vector pattern, const int m, const int k, - const int blockSize, const int seed = 0) { - typedef typename Matrix::value_type Scalar; - typedef typename Matrix::ordinal_type Ordinal; - typedef typename Matrix::non_const_size_type Offset; - typedef Kokkos::View> - UnmanagedRowmap; - typedef Kokkos::View> - UnmanagedEntries; - typedef Kokkos::View> - UnmanagedValues; - - srand(seed); - - auto gen_rand = []() -> double { return rand() % 10; }; - 
- // determine the number of rows and columns - // check rows and columns - for (const Coordinate &c : pattern) { - if (c.i >= m) { - KokkosKernels::Impl::throw_runtime_exception("i exceeded matrix rows"); - } - if (c.j >= k) { - KokkosKernels::Impl::throw_runtime_exception("j exceeded matrix cols"); - } - } - - // order the blocks - std::sort(pattern.begin(), pattern.end(), Coordinate::by_ij); - - // create values in order of the blocks (storage order for BSR) - std::vector val(pattern.size() * blockSize * blockSize); - for (typename std::vector::size_type idx = 0; idx < val.size(); - ++idx) { - val[idx] = gen_rand(); - } - - /* create the BsrMatrix adjacency info - use the sorted pattern. val is already in the correct storage order - */ - std::vector rowMap; - std::vector colInd; - - for (Coordinate &e : pattern) { - while (rowMap.size() < size_t(e.i + 1)) { // catch empty rows - rowMap.push_back(colInd.size()); - } - colInd.push_back(e.j); - } - // possibly empty rows at end of matrix - while (rowMap.size() <= size_t(m)) { - rowMap.push_back(colInd.size()); - } - - typename Matrix::row_map_type::non_const_type sparseRowMap("", rowMap.size()); - Kokkos::deep_copy(sparseRowMap, - UnmanagedRowmap(rowMap.data(), rowMap.size())); - typename Matrix::index_type::non_const_type sparseCols("", colInd.size()); - Kokkos::deep_copy(sparseCols, UnmanagedEntries(colInd.data(), colInd.size())); - typename Matrix::values_type::non_const_type sparseVals("", val.size()); - Kokkos::deep_copy(sparseVals, UnmanagedValues(val.data(), val.size())); - Kokkos::fence(); - - Matrix mat("bsr", m, k, sparseVals.size(), sparseVals, sparseRowMap, - sparseCols, blockSize); - return mat; -} - -/* a_scalar_t: the matrix type - x_scalar_t: the x-vector type - y_scalar_t: the y-vector type - - blockSize: the size of the dense blocks in the matrix - pattern: the non-zero locations of the blocks - m,n: the multiplication dimensions (in terms of blockSize) - k: number of vectors in the multivector - 
y[m*blockSize x k] = A[m*blockSize x n*blockSize] * x[n*blockSize x k] - - Compare the BsrMatrix spmv against a KokkosSparse::spmv on the same operands. - The controls are used in the BsrMatrix SpMV invocation - -*/ -template -void test_spmv_bsrmatrix_controls_pattern( - const KokkosKernels::Experimental::Controls &controls, - const std::vector &pattern, const int m, const int n, - lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta, - const int max_blocks_per_row) { - // get the widest passed scalar type - // typedef typename std::conditional= sizeof(x_scalar_t), - // a_scalar_t, x_scalar_t>::type wider_t; - // typedef typename std::conditional= sizeof(y_scalar_t), - // wider_t, y_scalar_t>::type widest_t; - - using crs_mat_t = typename KokkosSparse::CrsMatrix; - using bsr_mat_t = - typename KokkosSparse::Experimental::BsrMatrix; - using x_view_t = Kokkos::View; - using y_view_t = Kokkos::View; - - using DeviceRangePolicy = Kokkos::RangePolicy; - - crs_mat_t crs = expand_matrix(pattern, m, n, blockSize); - bsr_mat_t bsr = expand_matrix(pattern, m, n, blockSize); - - // only tue if the original matrix is a multiple of block size, and all blocks - // are dense - EXPECT_TRUE(bsr.nnz() * bsr.blockDim() * bsr.blockDim() == crs.nnz()); - EXPECT_TRUE(bsr.numRows() * bsr.blockDim() == crs.numRows()); - EXPECT_TRUE(bsr.numCols() * bsr.blockDim() == crs.numCols()); - - // expected operands - x_view_t exp_x("exp_x", n * blockSize, k); - y_view_t exp_y("exp_y", m * blockSize, k); - - // test operands - y_view_t test_y("test_y", m * blockSize, k); - x_view_t test_x("test_x", n * blockSize, k); - - constexpr x_scalar_t max_x = 10; - constexpr y_scalar_t max_y = 10; - constexpr a_scalar_t max_a = 10; - const double max_val = - beta * max_y + alpha * max_blocks_per_row * max_a * max_x; - - // fill expected with random values - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - Kokkos::fill_random(exp_x, rand_pool, - randomUpperBound(max_x)); - 
Kokkos::fill_random(exp_y, rand_pool, - randomUpperBound(max_y)); - - // copy expected operands to test operands - Kokkos::deep_copy(test_x, exp_x); - Kokkos::deep_copy(test_y, exp_y); - Kokkos::fence(); - - // generate expected y vector - // some error about Blas implementation - KokkosSparse::spmv("N", alpha, crs, exp_x, beta, exp_y); - Kokkos::fence(); - - // invoke tensor-core spmv - KokkosSparse::spmv(controls, "N", alpha, bsr, test_x, beta, test_y); - Kokkos::fence(); - - // test each vector - for (lno_t ki = 0; ki < k; ++ki) { - auto exp_y_i = Kokkos::subview(exp_y, Kokkos::ALL(), ki); - auto test_y_i = Kokkos::subview(test_y, Kokkos::ALL(), ki); - - // count errors - int num_errors = 0; - // Kokkos::ArithTraits in CUDA 9 is float on the host - // for CUDA 9, Kokkos half is actually float. However, the tensor core SpMV - // uses CUDA's half type, not Kokkos, so we still need a reduced precision - // test. - double eps = - 2 * KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; - Kokkos::parallel_reduce("KokkosSparse::Test::spmv_tc", - DeviceRangePolicy(0, exp_y_i.extent(0)), - Test::fSPMV( - exp_y_i, test_y_i, eps, max_val), - num_errors); - // explicit cast to double since no overload for half::operator<< - if (num_errors > 0) - std::cout << "KokkosSparse::Test::spmv_tc: " << num_errors - << " errors of " << exp_y_i.extent_int(0) << " for mv " << ki - << " (alpha=" - << double(Kokkos::ArithTraits::abs(alpha)) - << ", beta=" - << double(Kokkos::ArithTraits::abs(beta)) - << ", mode = N" - << ")\n"; - EXPECT_TRUE(num_errors == 0); - } -} - -/* test a particular pattern with all supported controls - */ -template -void test_spmv_bsrmatrix_pattern(const std::vector &pattern, - const int m, const int n, lno_t blockSize, - lno_t k, y_scalar_t alpha, y_scalar_t beta, - const int max_blocks_per_row) { - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - test_spmv_bsrmatrix_controls_pattern( 
- controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - -#if defined(KOKKOS_ARCH_AMPERE) - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - controls.setParameter("tc_precision", "double"); - test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } -#endif -} - -/* test a bunch of different matrices - */ -template -void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, - y_scalar_t beta) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - - // 1x1 full - { - int m = 1; - int n = 1; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(0, 0)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 1x1 empty - { - int m = 1; - int n = 1; - int max_blocks_per_row = 0; - std::vector pattern = {}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x2 top-left - { - int m = 2; - int n = 2; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(0, 0)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x2 bottom right - { - int m = 2; - int n = 2; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(1, 1)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x3 bottom right - { - int m = 2; - int n = 3; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(1, 2)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x10 long bottom row - { - int m = 2; - int n = 10; - int max_blocks_per_row = 10; - std::vector pattern; - for (int j = 0; j < n; ++j) { - pattern.push_back(Coordinate(1, j)); - } - test_spmv_bsrmatrix_pattern( - 
pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 10x10 column 1 + diagonal - { - int m = 10; - int n = 10; - int max_blocks_per_row = 2; - std::vector pattern; - for (int i = 0; i < n; ++i) { - pattern.push_back(Coordinate(i, 1)); - if (i != 1) { - pattern.push_back(Coordinate(i, i)); - } - } - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } -} - #define EXECUTE_TEST_ISSUE_101(DEVICE) \ TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ test_github_issue_101(); \ @@ -1664,124 +1181,6 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, 10, 2); \ } -/* Tensor Core SpMV - blocksize, k, alpha, beta -*/ -#define EXECUTE_TEST_TC(ASCALAR, XSCALAR, YSCALAR, ORDINAL, OFFSET, LAYOUT, \ - DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##spmv_tensor_core##_##ASCALAR##_##XSCALAR##_##YSCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - /* easy case with different alphas and betas*/ \ - test_spmv_bsrmatrix(16, 16, 0, 0); \ - test_spmv_bsrmatrix(16, 16, 1, 0); \ - test_spmv_bsrmatrix(16, 16, 0, 1); \ - test_spmv_bsrmatrix(16, 16, 1, 1); \ - /* easy case with a real alpha/beta */ \ - test_spmv_bsrmatrix(16, 16, 1.25, -2.73); \ - /* smaller block size with k < and > block size*/ \ - test_spmv_bsrmatrix(7, 6, 1.25, -2.73); \ - test_spmv_bsrmatrix(7, 7, 1.25, -2.73); \ - test_spmv_bsrmatrix(7, 8, 1.25, -2.73); \ - /* smaller block size with k < and > block size*/ \ - test_spmv_bsrmatrix(15, 14, 1.25, -2.73); \ - test_spmv_bsrmatrix(15, 15, 1.25, -2.73); \ - test_spmv_bsrmatrix(15, 16, 1.25, -2.73); \ - /* larger block size with k < and > block size*/ \ - test_spmv_bsrmatrix(17, 16, 1.25, -2.73); \ - test_spmv_bsrmatrix(17, 17, 1.25, -2.73); \ - test_spmv_bsrmatrix(17, 18, 1.25, -2.73); \ - /* larger block size with k < and > block size*/ \ - test_spmv_bsrmatrix(32, 31, 1.25, -2.73); \ - test_spmv_bsrmatrix(32, 32, 1.25, -2.73); \ - test_spmv_bsrmatrix(32, 
33, 1.25, -2.73); \ - /* more than one team per block*/ \ - test_spmv_bsrmatrix(33, 13, 1.25, -2.73); \ - test_spmv_bsrmatrix(33, 27, 1.25, -2.73); \ - test_spmv_bsrmatrix(33, 41, 1.25, -2.73); \ - } - -// minimal conditions for tensor core SpMV test -// BsrMatrix spmv is only supported on CUDA for the time being -#if defined(KOKKOS_ENABLE_CUDA) && defined(TEST_CUDA_SPARSE_CPP) && \ - (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE)) - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, float, int, size_t, LayoutLeft, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, float, float, int, -// size_t, LayoutLeft, TestExecSpace) EXECUTE_TEST_TC(float, kokkos_half, -// float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_TC(float, float, float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, double, int, size_t, LayoutLeft, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, double, double, int, -// size_t, LayoutLeft, TestExecSpace) EXECUTE_TEST_TC(double, kokkos_half, -// double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, float, int, size_t, LayoutRight, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, float, float, int, -// size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_TC(float, kokkos_half, -// float, int, size_t, LayoutRight, TestExecSpace) -EXECUTE_TEST_TC(float, float, float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, double, int, size_t, -// LayoutRight, TestExecSpace) EXECUTE_TEST_TC(kokkos_half, double, double, -// int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_TC(double, -// kokkos_half, double, int, size_t, LayoutRight, TestExecSpace) -EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#endif // tensor core SpMV tests - -#undef EXECUTE_TEST_TC - #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST_ISSUE_101(TestExecSpace) diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 695f03e67f..f39eb407c6 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -16,7 +16,7 @@ /*! \file Test_Sparse_spmv_bsr.hpp - Test the following 768 combos for at least a few matcies. + Test the following 256 combos for at least a few matcies. 
Algorithms Alpha Beta Block Sizes Modes (none) 0 0 1 N @@ -25,6 +25,8 @@ 3.7 -1.5 9 H There are also a subset of tests on larger matrices + + Multivector products are also tested for these cases with 1 and 7 vectors */ #include @@ -147,9 +149,10 @@ void reference_spmv(const char *mode, const Alpha &alpha, const Bsr &a, template -void test_spmv(const char *alg, const char *mode, const Alpha &alpha, - const Beta &beta, const Bsr &a, const XVector &x, - const YVector &y) { +void test_spmv( + const std::optional &controls, + const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, + const XVector &x, const YVector &y) { using execution_space = typename Bsr::execution_space; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; @@ -165,10 +168,8 @@ void test_spmv(const char *alg, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0)); Kokkos::deep_copy(yAct, y); - if (alg) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", alg); - KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + if (controls) { + KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); } else { KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); } @@ -216,9 +217,16 @@ void test_spmv(const char *alg, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { + std::string alg; + if (controls) { + alg = controls->getParameter("algorithm", ""); + } else { + alg = ""; + } + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" << std::endl; - std::cerr << "alg: " << (alg ? 
alg : "") << std::endl; + std::cerr << "alg: " << alg << std::endl; std::cerr << "mode: " << mode << std::endl; std::cerr << "A: " << a.numRows() << "x" << a.numCols() << std::endl; @@ -369,17 +377,40 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { */ template void test_spmv_combos(const char *mode, const Bsr &a) { - using scalar_type = typename Bsr::non_const_value_type; + using scalar_type = typename Bsr::non_const_value_type; + using execution_space = typename Bsr::execution_space; auto [x, y] = random_vecs_for_spmv(mode, a); - for (auto alg : - {(const char *)(nullptr), "native", "experimental_tc", "v4.1", "v4.2"}) { + // cover a variety of controls + using Ctrls = KokkosKernels::Experimental::Controls; + using OptCtrls = std::optional; + std::vector ctrls = { + std::nullopt, // no controls + OptCtrls(std::in_place, Ctrls()), + OptCtrls(std::in_place, Ctrls({{"algorithm", "tpl"}})), + OptCtrls(std::in_place, Ctrls({{"algorithm", "v4.1"}}))}; + + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { +#if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) + ctrls.push_back(Ctrls({{"algorithm", "experimental_tc"}})); +#if defined(KOKKOS_ARCH_AMPERE) + ctrls.push_back(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}})); +#endif // AMPERE +#endif // AMPERE || VOLTA + } +#endif // CUDA + } + + for (const auto &ctrl : ctrls) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(alg, mode, alpha, beta, a, x, y); + test_spmv(ctrl, mode, alpha, beta, a, x, y); } } } @@ -434,9 +465,10 @@ void test_spmv() { template -void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, - const Beta &beta, const Bsr &a, const XVector &x, - const YVector &y) { +void test_spm_mv( + const std::optional 
&controls, + const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, + const XVector &x, const YVector &y) { using execution_space = typename Bsr::execution_space; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; @@ -452,10 +484,8 @@ void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0), y.extent(1)); Kokkos::deep_copy(yAct, y); - if (alg) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", alg); - KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + if (controls) { + KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); } else { KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); } @@ -503,9 +533,16 @@ void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { + std::string alg; + if (controls) { + alg = controls->getParameter("algorithm", ""); + } else { + alg = ""; + } + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" << std::endl; - std::cerr << "alg: " << (alg ? 
alg : "") << std::endl; + std::cerr << "alg: " << alg << std::endl; std::cerr << "mode: " << mode << std::endl; std::cerr << "A: " << a.numRows() << "x" << a.numCols() << std::endl; @@ -566,17 +603,40 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, template void test_spm_mv_combos(const char *mode, const Bsr &a) { - using scalar_type = typename Bsr::non_const_value_type; + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + + // cover a variety of controls + using Ctrls = KokkosKernels::Experimental::Controls; + using OptCtrls = std::optional; + std::vector ctrls = { + std::nullopt, // no controls + OptCtrls(std::in_place, Ctrls()), + OptCtrls(std::in_place, Ctrls({{"algorithm", "tpl"}})), + OptCtrls(std::in_place, Ctrls({{"algorithm", "v4.1"}}))}; + + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { +#if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) + ctrls.push_back(Ctrls({{"algorithm", "experimental_tc"}})); +#if defined(KOKKOS_ARCH_AMPERE) + ctrls.push_back(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}})); +#endif // AMPERE +#endif // AMPERE || VOLTA + } +#endif // CUDA + } - for (size_t numVecs : {1, 2, 7}) { // num multivecs + for (size_t numVecs : {1, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (auto alg : {(const char *)(nullptr), "native", "experimental_tc", - "v4.1", "v4.2"}) { + for (const auto &ctrl : ctrls) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(alg, mode, alpha, beta, a, x, y); + test_spm_mv(ctrl, mode, alpha, beta, a, x, y); } } } From 2d1a0fb710ef06c75cfe2ec81f626e9c3d778add Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Aug 2023 
12:52:48 -0600 Subject: [PATCH 106/231] Fix #1891 (HashmapAccumulator data races on Ada and Hopper architectures). To avoid checking for every architecture at every location, add a new macro KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS which is defined if we're targeting an architecture with independent thread scheduling. --- .../KokkosKernels_BlockHashmapAccumulator.hpp | 30 +++++++--------- .../src/KokkosKernels_HashmapAccumulator.hpp | 36 ++++++++----------- common/src/KokkosKernels_Macros.hpp | 9 +++++ .../KokkosSparse_spgemm_impl_compression.hpp | 14 +++----- 4 files changed, 41 insertions(+), 48 deletions(-) diff --git a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp index f275bd007a..3ca160164c 100644 --- a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp @@ -250,9 +250,8 @@ struct BlockHashmapAccumulator { KokkosSparse::Impl::kk_block_set_mul( block_dim, values + my_write_index * block_size, valA, valB); -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -327,9 +326,8 @@ struct BlockHashmapAccumulator { KokkosSparse::Impl::kk_block_set_mul( block_dim, values + my_write_index * block_size, valA, valB); -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -407,9 +405,8 @@ struct BlockHashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -460,9 +457,8 @@ struct BlockHashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -514,9 +510,8 @@ struct BlockHashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -564,9 +559,8 @@ struct BlockHashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done diff --git a/common/src/KokkosKernels_HashmapAccumulator.hpp b/common/src/KokkosKernels_HashmapAccumulator.hpp index 3a12d399f2..1085cec4af 100644 --- a/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -16,6 +16,7 @@ #ifndef _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP #define _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP #include +#include "KokkosKernels_Macros.hpp" #include namespace KokkosKernels { @@ -412,9 +413,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA because warps do not go in SIMD fashion +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ because warps do not go in SIMD fashion // anymore. while some thread might insert my_write_index into linked // list, another thread in the warp might be reading keys in above loop. // before inserting the new value in liked list -- which is done with @@ -483,9 +483,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -601,9 +600,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -679,9 +677,8 @@ struct HashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -732,9 +729,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -786,9 +782,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -836,9 +831,8 @@ struct HashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done diff --git a/common/src/KokkosKernels_Macros.hpp b/common/src/KokkosKernels_Macros.hpp index d7f7af5a79..04234a5ce2 100644 --- a/common/src/KokkosKernels_Macros.hpp +++ b/common/src/KokkosKernels_Macros.hpp @@ -96,4 +96,13 @@ #endif // KOKKOS_COMPILER_GNU /******* END other helper macros *******/ +// define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS if we are targeting a CUDA +// architecture with "independent thread scheduling" (Volta70 and up). This +// requires some extra logic in HashmapAccumulator to avoid data races. 
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_ADA89) || \ + defined(KOKKOS_ARCH_HOPPER) +#define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS +#endif + #endif // KOKKOSKERNELS_MACROS_HPP_ diff --git a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 1e61a66c84..5365970292 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -546,8 +546,7 @@ struct KokkosSPGEMM Date: Tue, 1 Aug 2023 15:33:22 -0600 Subject: [PATCH 107/231] Fix SpAdd perf test when offset/ordinal is not int SpAdd perf test can call cusparse and MKL TPLs directly, but this can only be done if the default ordinal and offset type are both int. Check for this ahead of time and print a decent error message. Fixes build error when cusparse enabled, but offset int not enabled. --- common/src/KokkosKernels_default_types.hpp | 4 +- perf_test/sparse/KokkosSparse_spadd.cpp | 108 ++++++++++++++------- 2 files changed, 77 insertions(+), 35 deletions(-) diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 672bdf3fbb..30ca52e300 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -25,7 +25,8 @@ using default_lno_t = int; #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) using default_lno_t = int64_t; #else -using default_lno_t = int; +// Non-ETI build: default to int +using default_lno_t = int; #endif // Prefer int as the default offset type, because cuSPARSE doesn't support // size_t for rowptrs. 
@@ -34,6 +35,7 @@ using default_size_type = int; #elif defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) using default_size_type = size_t; #else +// Non-ETI build: default to int using default_size_type = int; #endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f27d7d93db..3b347eb903 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -185,6 +185,31 @@ void run_experiment(int argc, char** argv, CommonInputParams) { "If running MKL, can't output the result to file"); } + // Check that offset/ordinal types are compatible with any requested TPLs +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (params.use_mkl) { + if constexpr (!std::is_same_v) { + throw std::runtime_error( + "MKL configured with long long int not supported in Kokkos Kernels"); + } + if constexpr (!std::is_same_v || + !std::is_same_v) { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels to " + "call MKL SpAdd"); + } + } +#endif + + if (params.use_cusparse) { + if constexpr (!std::is_same_v || + !std::is_same_v) { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels to " + "call cuSPARSE SpAdd"); + } + } + std::cout << "************************************* \n"; crsMat_t A; crsMat_t B; @@ -319,9 +344,11 @@ void run_experiment(int argc, char** argv, CommonInputParams) { } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - sparse_matrix_t Amkl, Bmkl, Cmkl; + sparse_matrix_t Amkl = sparse_matrix_t(), Bmkl = sparse_matrix_t(), + Cmkl = sparse_matrix_t(); if (params.use_mkl) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v && + std::is_same_v) { KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), @@ -330,9 +357,6 @@ void run_experiment(int argc, char** argv, CommonInputParams) { &Bmkl, 
SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); - } else { - throw std::runtime_error( - "MKL configured with long long int not supported in Kokkos Kernels"); } } #endif @@ -347,22 +371,30 @@ void run_experiment(int argc, char** argv, CommonInputParams) { c_nnz = addHandle->get_c_nnz(); } else if (params.use_cusparse) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // Symbolic phase: compute buffer size, then compute nnz - size_t bufferSize; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2_bufferSizeExt( - cusparseHandle, A.numRows(), A.numCols(), &alphabeta, A_cusparse, - A.nnz(), A.values.data(), A.graph.row_map.data(), - A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), - B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), - C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize)); - // Allocate work buffer - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaMalloc((void**)&cusparseBuffer, bufferSize)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( - cusparseHandle, m, n, A_cusparse, A.nnz(), A.graph.row_map.data(), - A.graph.entries.data(), B_cusparse, B.nnz(), B.graph.row_map.data(), - B.graph.entries.data(), C_cusparse, row_mapC.data(), &c_nnz, - cusparseBuffer)); + if constexpr (std::is_same_v && + std::is_same_v) { + // Symbolic phase: compute buffer size, then compute nnz + size_t bufferSize; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2_bufferSizeExt( + cusparseHandle, A.numRows(), A.numCols(), &alphabeta, A_cusparse, + A.nnz(), A.values.data(), A.graph.row_map.data(), + A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), + B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), + C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize)); + // Allocate work buffer + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc((void**)&cusparseBuffer, bufferSize)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( + cusparseHandle, m, n, A_cusparse, A.nnz(), 
A.graph.row_map.data(), + A.graph.entries.data(), B_cusparse, B.nnz(), B.graph.row_map.data(), + B.graph.entries.data(), C_cusparse, row_mapC.data(), &c_nnz, + cusparseBuffer)); + } else { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels " + "to " + "call cuSPARSE"); + } #endif } if (!params.use_mkl) { @@ -381,24 +413,32 @@ void run_experiment(int argc, char** argv, CommonInputParams) { for (int numericRep = 0; numericRep < params.numericRepeat; numericRep++) { if (params.use_cusparse) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2( - cusparseHandle, m, n, &alphabeta, A_cusparse, A.nnz(), - A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), - &alphabeta, B_cusparse, B.nnz(), B.values.data(), - B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, - valuesC.data(), row_mapC.data(), entriesC.data(), cusparseBuffer)); + if constexpr (std::is_same_v && + std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2( + cusparseHandle, m, n, &alphabeta, A_cusparse, A.nnz(), + A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), + &alphabeta, B_cusparse, B.nnz(), B.values.data(), + B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, + valuesC.data(), row_mapC.data(), entriesC.data(), + cusparseBuffer)); + } #endif } else if (params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( - SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + if constexpr (std::is_same_v && + std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( + SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + } #endif } else { - spadd_numeric( - &kh, A.graph.row_map, A.graph.entries, A.values, 1.0, // A, alpha - B.graph.row_map, B.graph.entries, B.values, 1.0, // B, beta - row_mapC, entriesC, 
valuesC); // C + spadd_numeric(&kh, A.graph.row_map, A.graph.entries, A.values, + 1.0, // A, alpha + B.graph.row_map, B.graph.entries, B.values, + 1.0, // B, beta + row_mapC, entriesC, valuesC); // C } } numericTime += timer.seconds(); From ecab8a82a60787d7c16be90cf0d36918de61a071 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 1 Aug 2023 17:50:27 -0600 Subject: [PATCH 108/231] Fix cm_generate_makefile --boundscheck (#1926) (insert a space between it and previous flag) --- cm_generate_makefile.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 913b4e67a5..adb1678908 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -799,7 +799,7 @@ cd ${KOKKOS_INSTALL_PATH} echo "" echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} 
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J From 8a7a48abfc8a483647c455ee123031fd5d801b6d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 2 Aug 2023 11:05:16 -0600 Subject: [PATCH 109/231] Test_ODE_Newton: Add template parameters for Kokkos::pair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attempt to resolve "error: ‘pair’ was not declared in this scope" in cuda/11.2+gcc/8.3.0 builds --- ode/unit_test/Test_ODE_Newton.hpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index da29d895fc..44a299b949 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -49,16 +49,24 @@ struct NewtonSolve_wrapper { void operator()(const int idx) const { // Take subviews to create the local problem auto local_x = Kokkos::subview( - x, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 
1))); + x, Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); auto local_rhs = Kokkos::subview( - rhs, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1))); + rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); auto local_update = Kokkos::subview( - update, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1))); + update, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); auto local_J = Kokkos::subview( - J, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1)), + J, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); auto local_tmp = Kokkos::subview( - tmp, Kokkos::pair(my_nls.neqs * idx, my_nls.neqs * (idx + 1)), + tmp, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); // Run Newton nonlinear solver From a2cd2ebbd847de396e888b53e86d80d72813ba1e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 8 Aug 2023 10:30:03 -0600 Subject: [PATCH 110/231] Removal of all printouts in the unit tests of ger() and syr(). Also on ger(): - better explanations on unit_tests, - its unit tests now have terminology similar to syr() unit tests, - replace 'one level' and 'two level' terminology on the impl file, following syr(). 
Also on syr(): - small improvement on the ROC tpl file --- blas/impl/KokkosBlas2_ger_impl.hpp | 69 +++---- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 2 +- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 25 +-- blas/unit_test/Test_Blas2_ger.hpp | 192 +++++++++++++----- blas/unit_test/Test_Blas2_syr.hpp | 100 +++++++-- 5 files changed, 281 insertions(+), 107 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index fa2220e00a..68a158bd68 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -25,17 +25,17 @@ namespace KokkosBlas { namespace Impl { -// Functor for a single-level parallel_for version of nontranspose GER. -// The functor parallelizes over rows of the input matrix A. +// Functor for the thread parallel version of GER. +// This functor parallelizes over rows of the input matrix A. template -struct SingleLevelGER { +struct ThreadParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - SingleLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -69,13 +69,13 @@ struct SingleLevelGER { AViewType A_; }; -// Single-level parallel version of GER. +// Thread parallel version of SYR. 
template -void singleLevelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +void threadParallelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -90,22 +90,22 @@ void singleLevelGer(const ExecutionSpace& space, const char trans[], } else { Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - SingleLevelGER functor( + ThreadParallelGER functor( (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[SingleLevel]", rangePolicy, functor); + Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, functor); } } -struct TwoLevelGER_LayoutLeftTag {}; -struct TwoLevelGER_LayoutRightTag {}; +struct TeamParallelGER_LayoutLeftTag {}; +struct TeamParallelGER_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- -// Functor for a two-level parallel_reduce version of GER, designed for -// performance on GPU. Kernel depends on the layout of A. +// Functor for the team parallel version of GER, designed for +// performance on GPU. The kernel depends on the layout of A. 
template -struct TwoLevelGER { +struct TeamParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; @@ -114,15 +114,15 @@ struct TwoLevelGER { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutLeftTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do @@ -147,7 +147,7 @@ struct TwoLevelGER { } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutRightTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do @@ -169,7 +169,6 @@ struct TwoLevelGER { }); } } - team.team_barrier(); } private: @@ -180,12 +179,12 @@ struct TwoLevelGER { AViewType A_; }; -// Two-level parallel version of GER. +// Team parallel version of SYR. 
template -void twoLevelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) { +void teamParallelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -205,8 +204,8 @@ void twoLevelGer(const ExecutionSpace& space, const char trans[], constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -217,15 +216,17 @@ void twoLevelGer(const ExecutionSpace& space, const char trans[], teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelGER + TeamParallelGER functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); + Kokkos::parallel_for("KokkosBlas::ger[teamParallel]", teamPolicy, functor); } // --------------------------------------------------------------------------------------------- -// generalGer: use 1 level (Range) or 2 level (Team) implementation, -// depending on whether execution space is CPU or GPU. +// generalGerImpl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// // The 'enable_if' makes sure unused kernels are not instantiated. 
template +template struct ger_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 13e2bd21b1..43b5fe3740 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -22,14 +22,15 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (*uplo == 'L' || *uplo == 'l') ? rocblas_fill_lower \ - : rocblas_fill_upper; +#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
rocblas_fill_lower \ + : rocblas_fill_upper; #define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ ETI_SPEC_AVAIL) \ @@ -57,7 +58,7 @@ namespace Impl { typename AViewType::const_value_type& alpha, \ const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ @@ -103,7 +104,7 @@ namespace Impl { typename AViewType::const_value_type& alpha, \ const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ @@ -150,7 +151,7 @@ namespace Impl { const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ if (A_is_ll) { \ @@ -223,7 +224,7 @@ namespace Impl { const XViewType& X, const AViewType& A) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (justTranspose) { \ if (A_is_ll) { \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 7e9ed08d88..bf45ba5fc3 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -14,6 +14,37 @@ // //@HEADER +// 
********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation A += alpha * x * y^{T,H}. +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'y' components: float, double, complex, ... +// 03) Type of 'A' components: float, double, complex, ... +// 04) Execution space: serial, threads, OpenMP, Cuda, ... +// 05) Layout of 'x' +// 06) Layout of 'y' +// 07) Layout of 'A' +// 08) Dimension of 'A' +// 09) Options 'const' or 'non const' for x view, when calling ger() +// 10) Options 'const' or 'non const' for y view, when calling ger() +// 11) Usage of analytical results in the tests +// 12) Options 'T' or 'H' when calling ger() +// +// Choices (01)-(04) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_ger<...>(). +// +// Choices (05)-(12) are selected in routine test_gerr<...>(), +// when calling the method test() of class Test::GerTester<...>. +// +// The class Test::GerTester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::GerTester<...>::test() +// is given by the 9 steps named "Step 1 of 9" to "Step 9 of 9" +// in the code below. 
+// ********************************************************************** + #include #include #include @@ -35,18 +66,17 @@ class GerTester { const bool useHermitianOption = false); private: - typedef Kokkos::View _ViewTypeX; - typedef Kokkos::View _ViewTypeY; - typedef Kokkos::View _ViewTypeA; + using _ViewTypeX = Kokkos::View; + using _ViewTypeY = Kokkos::View; + using _ViewTypeA = Kokkos::View; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeY::HostMirror _HostViewTypeY; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View - _ViewTypeExpected; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; - typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, @@ -88,28 +118,28 @@ class GerTester { typename std::enable_if>::value || std::is_same>::value, void>::type - compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && !std::is_same>::value, void>::type - compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template typename std::enable_if>::value || std::is_same>::value, void>::type - compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && 
!std::is_same>::value, void>::type - compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template @@ -127,8 +157,8 @@ class GerTester { const bool _A_is_ll; const bool _testIsGpu; const bool _vanillaUsesDifferentOrderOfOps; - const _AuxType _epsAbs; - const _AuxType _epsRel; + const _AuxType _absTol; + const _AuxType _relTol; int _M; int _N; bool _useAnalyticalResults; @@ -154,8 +184,16 @@ GerTester::value ? 1.0e-6 : 1.0e-9), - _epsRel(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 
5.0e-3 : 1.0e-6), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -177,6 +215,7 @@ void GerTesterpopulateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); @@ -262,7 +303,7 @@ void GerTestercompareVanillaExpected(alpha, h_vanilla.d_view, h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -323,10 +364,12 @@ void GerTester>::value || std::is_same>::value, void>::type GerTester::compareVanillaExpected(const T& alpha, + Device>::compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * @@ -687,7 +730,7 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRealAbs++; @@ -700,13 +743,14 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() @@ -714,12 +758,13 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); @@ -815,22 +866,26 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1177,7 +1247,7 @@ typename std::enable_if>::value && !std::is_same>::value, void>::type GerTester::compareKokkosExpected(const T& alpha, + Device>::compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * @@ -1196,7 
+1266,7 @@ GerTester diffThreshold) { errorHappened = true; numErrorsAbs++; @@ -1209,21 +1279,24 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() @@ -1241,6 +1314,7 @@ GerTester 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif { std::ostringstream msg; msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr @@ -1263,7 +1337,9 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1278,22 +1354,28 @@ void GerTestercompareKokkosExpected(alpha, h_A, h_expected); + this->compareKkGerAgainstExpected(alpha, h_A, h_expected); } } } // namespace Test template +#ifdef HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); - +#else +int test_ger(const std::string& /*caseName*/) { +#endif bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || @@ -1340,12 +1425,13 @@ int test_ger(const std::string& caseName) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); - +#endif if (true) { Test::GerTester @@ -1374,22 +1460,25 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); - +#endif if (true) { Test::GerTester @@ -1418,21 +1507,24 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); - +#endif if (true) { Test::GerTester @@ -1458,21 +1550,24 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( 
"+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); - +#endif if (true) { Test::GerTester @@ -1493,18 +1588,21 @@ int test_ger(const std::string& caseName) { tester.test(1024, 1024, 0); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); - +#endif return 1; } @@ -1566,7 +1664,7 @@ TEST_F(TestCategory, ger_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, ger_double_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); - test_ger("test case ger_mixed_types"); + test_ger("test case ger_double_int_float"); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 83f8a8c175..e7b5e7de3d 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -221,6 +221,7 @@ void SyrTester::test( const int N, const int nonConstConstCombinations, const bool useAnalyticalResults, const bool useHermitianOption, const bool useUpOption) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering SyrTester::test()... 
- - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " @@ -236,7 +237,7 @@ void SyrTester::test( << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption << ", useUpOption = " << useUpOption << std::endl; - +#endif // ******************************************************************** // Step 1 of 7: declare main types and variables // ******************************************************************** @@ -286,9 +287,11 @@ void SyrTester::test( // ******************************************************************** view_stride_adapter<_ViewTypeExpected, true> h_vanilla( "vanilla = A + alpha * x * x^{t,h}", _M, _N); +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); +#endif this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** @@ -348,10 +351,12 @@ void SyrTester::test( EXPECT_ANY_THROW(KokkosBlas::syr("T", "", alpha, x.d_view, A.d_view)) << "Failed test: kk syr should have thrown an exception for uplo ''"; +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - " << std::endl; +#endif } template ::populateVariables( Kokkos::deep_copy(A, h_A); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { @@ -461,6 +467,7 @@ void SyrTester::populateVariables( } } } +#endif } // Code for complex values @@ -719,6 +726,7 @@ SyrTester:: compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { @@ -728,7 +736,7 @@ 
SyrTester:: } } } - +#endif int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -772,6 +780,7 @@ SyrTester:: } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() @@ -779,6 +788,7 @@ SyrTester:: "h_vanilla(i,j).real()) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); @@ -804,6 +814,7 @@ SyrTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() @@ -811,6 +822,7 @@ SyrTester:: "h_vanilla(i,j).imag()) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i @@ -840,7 +852,9 @@ SyrTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); @@ -870,7 +884,9 @@ SyrTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); @@ -883,22 +899,26 @@ SyrTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; +#endif } 
numErrorsReal++; } if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; +#endif } numErrorsImag++; } @@ -939,9 +959,11 @@ SyrTester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) << std::endl; +#endif } } } @@ -984,12 +1006,14 @@ SyrTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i @@ -1018,7 +1042,9 @@ SyrTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1029,9 +1055,11 @@ SyrTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; +#endif } numErrors++; } @@ -1062,9 +1090,11 @@ SyrTester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) << ", h_A(" << i << "," << j << ")=" << h_A(i, j) << std::endl; +#endif } } } @@ -1110,12 +1140,14 @@ SyrTester:: } } if 
(errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).real() = " << h_reference(i, j).real() << ", h_A(i,j).real() = " << h_A(i, j).real() << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); @@ -1141,16 +1173,19 @@ SyrTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() << ", h_A(i,j).imag() = " << h_A(i, j).imag() << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll @@ -1206,7 +1241,7 @@ SyrTester:: << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } - +#endif { std::ostringstream msg; msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr @@ -1232,7 +1267,9 @@ SyrTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1261,7 +1298,9 @@ SyrTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1277,6 +1316,7 @@ typename std::enable_if>::value && SyrTester:: compareKkSyrAgainstReference(const T& alpha, const 
_HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { @@ -1286,7 +1326,7 @@ SyrTester:: } } } - +#endif int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -1323,14 +1363,17 @@ SyrTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j) = " << h_reference(i, j) << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() @@ -1349,6 +1392,7 @@ SyrTester:: << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif { std::ostringstream msg; msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr @@ -1371,7 +1415,9 @@ SyrTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1385,12 +1431,14 @@ void SyrTester:: _ViewTypeA& A, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected, const std::string& situation) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); +#endif std::string mode = _useHermitianOption ? 
"H" : "T"; std::string uplo = _useUpOption ? "U" : "L"; bool gotStdException(false); @@ -1398,12 +1446,16 @@ void SyrTester:: try { KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; +#endif gotStdException = true; } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "': caught unknown exception" << std::endl; +#endif gotUnknownException = true; } @@ -1436,25 +1488,31 @@ void SyrTester:: // ******************************************************************** // Call ger() // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException); +#endif std::string mode = _useHermitianOption ? "H" : "T"; bool gotStdException(false); bool gotUnknownException(false); try { KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught exception, e.what() = " << e.what() << std::endl; +#endif gotStdException = true; } catch (...) 
{ +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught unknown exception" << std::endl; +#endif gotUnknownException = true; } @@ -1501,12 +1559,15 @@ void SyrTester:: } // namespace Test template +#ifdef HAVE_KOKKOSKERNELS_DEBUG int test_syr(const std::string& caseName) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); - +#else +int test_syr(const std::string& /*caseName*/) { +#endif bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || @@ -1520,12 +1581,13 @@ int test_syr(const std::string& caseName) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); - +#endif if (true) { Test::SyrTester @@ -1554,22 +1616,25 @@ int test_syr(const std::string& caseName) { tester.test(2131, 0); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); - +#endif if (true) { Test::SyrTester @@ -1598,22 +1663,25 @@ int test_syr(const std::string& caseName) { tester.test(2131, 0); } +#ifdef 
HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); - +#endif if (true) { Test::SyrTester @@ -1642,21 +1710,24 @@ int test_syr(const std::string& caseName) { tester.test(2131, 0); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); - +#endif if (true) { Test::SyrTester @@ -1683,18 +1754,21 @@ int test_syr(const std::string& caseName) { tester.test(1024, 0); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); #endif +#endif +#ifdef HAVE_KOKKOSKERNELS_DEBUG KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); - +#endif return 1; } @@ -1752,9 +1826,9 @@ TEST_F(TestCategory, syr_int) { #if 
!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, syr_double_int) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double_int"); - test_syr("test case syr_double_int"); +TEST_F(TestCategory, syr_int_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int_float"); + test_syr("test case syr_int_float"); Kokkos::Profiling::popRegion(); } #endif From 6399f07d08542c5937d0cfa7f62e7a08e60cb417 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 8 Aug 2023 10:42:15 -0600 Subject: [PATCH 111/231] Formatting --- blas/impl/KokkosBlas2_ger_impl.hpp | 6 ++-- blas/unit_test/Test_Blas2_ger.hpp | 52 ++++++++++++++++-------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 68a158bd68..b9e0b548d4 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -92,7 +92,8 @@ void threadParallelGer(const ExecutionSpace& space, const char trans[], A.extent(0)); ThreadParallelGER functor( (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, functor); + Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, + functor); } } @@ -184,7 +185,8 @@ template void teamParallelGer(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) { + const XViewType& x, const YViewType& y, + const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index bf45ba5fc3..6bf44f98f8 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -73,9 +73,10 @@ class GerTester { using _HostViewTypeX = typename _ViewTypeX::HostMirror; using _HostViewTypeY = typename 
_ViewTypeY::HostMirror; using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = Kokkos::View; + using _ViewTypeExpected = + Kokkos::View; - using _KAT_A = Kokkos::ArithTraits; + using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, @@ -118,29 +119,31 @@ class GerTester { typename std::enable_if>::value || std::is_same>::value, void>::type - compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected); + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && !std::is_same>::value, void>::type - compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected); + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value || std::is_same>::value, void>::type compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && !std::is_same>::value, void>::type compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + const _ViewTypeExpected& h_expected); template T shrinkAngleToZeroTwoPiRange(const T input); @@ -303,7 +306,8 @@ void GerTestercompareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -703,10 +707,10 @@ template typename std::enable_if>::value || std::is_same>::value, void>::type 
-GerTester::compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +GerTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -917,10 +921,10 @@ template typename std::enable_if>::value && !std::is_same>::value, void>::type -GerTester::compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +GerTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -1034,10 +1038,9 @@ template typename std::enable_if>::value || std::is_same>::value, void>::type -GerTester::compareKkGerAgainstExpected(const T& alpha, - const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { +GerTester:: + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -1246,10 +1249,9 @@ template typename std::enable_if>::value && !std::is_same>::value, void>::type -GerTester::compareKkGerAgainstExpected(const T& alpha, - const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { +GerTester:: + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); From efd7f4adf4a091b905e7809d33a57c03a5de8d51 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 8 Aug 2023 10:51:38 -0600 Subject: [PATCH 112/231] More formatting --- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp 
b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 43b5fe3740..cf02e9e207 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -22,14 +22,14 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower \ +#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? rocblas_fill_lower \ : rocblas_fill_upper; #define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ From 0a8b20f815ac8de26117c4c9efa1fb1798e38ecd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 8 Aug 2023 12:44:59 -0600 Subject: [PATCH 113/231] Fix build. Add fall-back overload --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a30b2e777d..2a1cd5e6e0 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -146,6 +146,17 @@ kk_is_a64fx_mem_space() { } #endif // a64fx architectures +// Host function to determine free and total device memory. +// Will throw if execution space doesn't support this. 
+template +inline void kk_get_free_total_memory(size_t& /* free_mem */, + size_t& /* total_mem */) { + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() + << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. template From be51f4e9f062847d6714268a6e96766bd951ebf5 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 8 Aug 2023 13:07:36 -0600 Subject: [PATCH 114/231] Two minor typos --- blas/impl/KokkosBlas2_ger_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index b9e0b548d4..651db7f11a 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -69,7 +69,7 @@ struct ThreadParallelGER { AViewType A_; }; -// Thread parallel version of SYR. +// Thread parallel version of GER. template void threadParallelGer(const ExecutionSpace& space, const char trans[], @@ -180,7 +180,7 @@ struct TeamParallelGER { AViewType A_; }; -// Team parallel version of SYR. +// Team parallel version of GER. 
template void teamParallelGer(const ExecutionSpace& space, const char trans[], From ad6161c96d16560ae640cf7cedc0f840474bef13 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 11:46:02 -0600 Subject: [PATCH 115/231] sparse/unit_test: Add PSGS stream tests --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 295 +++++++++++++++--- 1 file changed, 243 insertions(+), 52 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 358205b713..260bac8a83 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -142,6 +142,56 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } +template +void run_gauss_seidel_streams( + std::vector kh, std::vector input_mat, + std::vector x_vector, std::vector y_vector, + bool is_symmetric_graph, typename crsMat_t::value_type omega, + int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. + int nstreams = 1) { + for (int i = 0; i < nstreams; i++) { + gauss_seidel_symbolic(&kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, is_symmetric_graph); + gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, is_symmetric_graph); + } + + const int apply_count = 2; + for (int i = 0; i < nstreams; i++) { + switch (apply_type) { + case 0: + symmetric_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + case 1: + forward_sweep_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + case 2: + 
backward_sweep_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + default: + symmetric_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + } + } +} } // namespace Test template (2000, 2000 * 20, \ - 200, 10, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_empty(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_balloon_clustering(5000, 100, 2000); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_sequential_sor(1000, 1000 * 15, 50, \ - 10); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_long_rows(500, 10, 20, \ - true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_custom_coloring(500, \ - 
10); \ +template +void test_gauss_seidel_streams_rank1( + lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool symmetric, double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1) { + using namespace Test; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using execution_space = typename device::execution_space; + + using const_size_type = const size_type; + using const_lno_t = const lno_t; + using const_scalar_t = const scalar_t; + using KernelHandle = + KokkosKernelsHandle; + srand(245); + lno_t numCols = numRows; + typename crsMat_t::value_type m_omega = omega; + + std::vector instances; + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector input_mat_v(nstreams); + std::vector solution_x_v(nstreams); + std::vector x_vector_v(nstreams); + std::vector y_vector_v(nstreams); + std::vector initial_norm_res_v(nstreams); + + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + + for (int i = 0; i < nstreams; i++) { + input_mat_v[i] = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); + + if (symmetric) { + // Symmetrize on host, rather than relying on the parallel versions (those + // can be tested for symmetric=false) + input_mat_v[i] = + Test::symmetrize( + input_mat_v[i]); + } + lno_t nv = input_mat_v[i].numRows(); + scalar_view_t 
solution_x_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); + solution_x_v[i] = solution_x_tmp; + create_random_x_vector(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); + // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the + // behavior of each algorithm _should be_ the same on every execution space, + // which is why we just test GS_DEFAULT. + + scalar_view_t x_vector_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); + x_vector_v[i] = x_vector_tmp; + + kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + } + + int apply_count = 3; // test symmetric, forward, backward + //*** Point-coloring version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + Kokkos::Timer timer1; + + for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); + + run_gauss_seidel_streams(kh_v, input_mat_v, x_vector_v, y_vector_v, + symmetric, m_omega, apply_type, nstreams); + // double gs = timer1.seconds(); + // KokkosKernels::Impl::print_1Dview(x_vector); + } + + // Check result + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(one, solution_x_v[i], -one, x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); + std::string info = "on stream_idx: " + std::to_string(i); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + } +} + +#if 0 + lno_t numRows, + size_type nnz, + lno_t bandwidth, + lno_t row_size_variance, + bool symmetric, + double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1 +#endif +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, false); \ + } \ + 
TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_empty(); \ + } \ + TEST_F( \ + TestCategory, \ + 
sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_balloon_clustering(5000, 100, 2000); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_sequential_sor(1000, 1000 * 15, 50, \ + 10); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_long_rows(500, 10, 20, \ + true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_custom_coloring(500, \ + 10); \ } #include From d9c5ec6f1e1818efe7e6990a217cb592cca02c15 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:12:42 -0600 Subject: [PATCH 116/231] Fix docs build --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 447d96d2a1..134a100cc7 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -652,7 +652,7 @@ class TwoStageGaussSeidelHandle /** * @brief Construct a new Two Stage Gauss Seidel Handle object * - * @param gsh The GaussSeidel handle. + * @param gs_handle The GaussSeidel handle. 
*/ TwoStageGaussSeidelHandle(GSHandle gs_handle) : GSHandle(gs_handle), From 9a7775a2998cfeff07fc930a6481faca1d0eb19b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:15:03 -0600 Subject: [PATCH 117/231] Move print statements to avoid timing them --- perf_test/sparse/KokkosSparse_gs.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 119941cebc..2a8b164219 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -220,21 +220,17 @@ void runGS(const GS_Parameters& params) { &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); double symbolicLaunchTime = timer.seconds(); - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double symbolicComputeTime = timer.seconds(); - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); double numericLaunchTime = timer.seconds(); - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double numericComputeTime = timer.seconds(); - std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -256,11 +252,9 @@ void runGS(const GS_Parameters& params) { } double applyLaunchTime = timer.seconds(); - std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double applyComputeTime = timer.seconds(); - std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual @@ -271,6 +265,12 @@ void runGS(const GS_Parameters& 
params) { KokkosSparse::spmv("N", alpha, A, x, beta, res); double resnorm = KokkosBlas::nrm2(res); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; // note: this still works if the solution diverges std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; } From 423da45e0b3d707a00f3e233a28cff9897b8a7e2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:53:17 -0600 Subject: [PATCH 118/231] common/src: Correct kk_get_free_total_memory --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 60 +++++++++++++++++---- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2a1cd5e6e0..eb629f9e0c 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -173,32 +173,53 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, - int n_streams = 1) { + int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); free_mem /= n_streams; total_mem /= n_streams; } template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, - int n_streams = 1) { - kk_get_free_total_memory(free_mem, total_mem, n_streams); + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + 
size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { - kk_get_free_total_memory(free_mem, total_mem, n_streams); + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, + 1); +} #endif // FIXME_SYCL Use compiler extension instead of low level interface when @@ -206,7 +227,7 @@ inline void kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -242,19 +263,40 @@ inline void kk_get_free_total_memory( total_mem /= n_streams; } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory( free_mem, total_mem, n_streams); } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + 
kk_get_free_total_memory( + free_mem, total_mem, 1); +} + template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory( free_mem, total_mem, n_streams); } + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} #endif template From c1129fcf64b23282aefe984384b12e73a4b22c4d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 10:55:30 -0600 Subject: [PATCH 119/231] docs: Note which build has eti disabled --- docs/developer/testing_table.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/developer/testing_table.rst b/docs/developer/testing_table.rst index 4b9600fa04..a2ec29adf8 100644 --- a/docs/developer/testing_table.rst +++ b/docs/developer/testing_table.rst @@ -14,6 +14,7 @@ The following is a description of abbreviations used throughout the testing tabl * REL: CMake release build type * DBG: CMake debug build type * BCHK: Kokkos core bounds checking +* NOETI: No default ETI types included * UVM: Unified Memory (Cuda) The following is a description of column headings in the testing table. @@ -107,7 +108,7 @@ The following is a description of column headings in the testing table. 
* int, `size_t` * LayoutLeft, LayoutRight - * * `PR_SKX_GNU1020_OPENMP_LEFT_REL` + * * `PR_SKX_GNU1020_OPENMP_LEFT_REL_NOETI` * Skx * GNU 10.2.0 * OpenMP From e21ecd557eae4de6aa32ad63c416ba70f7ceff87 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 16:00:12 -0600 Subject: [PATCH 120/231] docs: Improve formatting --- batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 4f62d0b0d4..4725e0220d 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -36,14 +36,17 @@ namespace KokkosBatched { /// C = alpha * op(A) * op(B) + beta * C /// /// \tparam ArgTransA Specifies what op does to A: +/// /// Trans::NoTranspose for non-transpose /// Trans::Transpose for transpose /// Trans::ConjTranspose for conjugate transpose /// \tparam ArgTransB Specifies what op does to B: +/// /// Trans::NoTranspose for non-transpose /// Trans::Transpose for transpose /// Trans::ConjTranspose for conjugate transpose /// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// /// AViewType, BViewType, and CViewType: /// BatchLayout::Left Batch dimension is leftmost /// BatchLayout::Right Batch dimension is rightmost @@ -61,13 +64,16 @@ namespace KokkosBatched { /// See struct BatchedGemmHandle for details. 
/// \param alpha [in] Input coefficient used for multiplication with A /// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK /// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN /// \param beta [in] Input coefficient used for multiplication with C /// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN /// \return 0 upon success, non-zero otherwise From 1f09309ea1d0429945a1e61f53edf2562ec9c207 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 14 Aug 2023 08:53:24 -0400 Subject: [PATCH 121/231] Update SYCL docker image to Cuda 11.7.1 --- scripts/docker/Dockerfile.sycl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 4e185f4c1b..714461bfe6 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:11.7.0-devel-ubuntu22.04 +ARG BASE=nvidia/cuda:11.7.1-devel-ubuntu22.04 FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub From 30a2e3f92dd6e98954c1275ca498aaefc7b00d5a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 14 Aug 2023 13:32:04 +0000 Subject: [PATCH 122/231] Avoid enum without underlying type to fix SYCL --- ode/src/KokkosODE_Types.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 068c4b17ed..7d78227526 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -51,7 +51,11 @@ struct ODE_params { 
min_step_size(min_step_size_) {} }; -enum newton_solver_status { NLS_SUCCESS = 0, MAX_ITER = 1, LIN_SOLVE_FAIL = 2 }; +enum newton_solver_status : int { + NLS_SUCCESS = 0, + MAX_ITER = 1, + LIN_SOLVE_FAIL = 2 +}; struct Newton_params { int max_iters; From 2a855d4d3de272748c742822ce31e8bfa561e81f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 13:29:37 -0600 Subject: [PATCH 123/231] .github/workflows/docs.yml: Use up-to-date doxygen version --- .github/workflows/docs.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2488790254..33f5f48e39 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,15 +11,16 @@ permissions: jobs: docs-check: - runs-on: ubuntu-latest + runs-on: [macos-latest] steps: - name: Install Dependencies run: | - sudo apt-get update - sudo apt-get install --no-install-recommends doxygen-latex - pip install sphinx - pip install breathe - pip install sphinx-rtd-theme + brew install doxygen + python3 -m pip install sphinx + python3 -m pip install breathe + python3 -m pip install sphinx-rtd-theme + sphinx-build --version + doxygen --version - name: checkout_kokkos_kernels uses: actions/checkout@v3 From 048dc77db266d3767491bbbfae0e2eaf9b0eded3 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 16 Aug 2023 12:31:43 -0600 Subject: [PATCH 124/231] Unit-Test: adding specific test for block sparse functions This basically adds one more executable per backend that will run all our block sparse unit-test and thus reduce the pressure on the sparse unit-test timeout. 
--- sparse/unit_test/CMakeLists.txt | 56 +++++++++++++++++++ sparse/unit_test/Test_BlockSparse.hpp | 24 ++++++++ sparse/unit_test/Test_Sparse.hpp | 4 -- .../backends/Test_Cuda_BlockSparse.cpp | 22 ++++++++ .../backends/Test_HIP_BlockSparse.cpp | 22 ++++++++ .../Test_OpenMPTarget_BlockSparse.cpp | 22 ++++++++ .../backends/Test_OpenMP_BlockSparse.cpp | 22 ++++++++ .../backends/Test_SYCL_BlockSparse.cpp | 22 ++++++++ .../backends/Test_Serial_BlockSparse.cpp | 22 ++++++++ .../backends/Test_Threads_BlockSparse.cpp | 22 ++++++++ 10 files changed, 234 insertions(+), 4 deletions(-) create mode 100644 sparse/unit_test/Test_BlockSparse.hpp create mode 100644 sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_HIP_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_Serial_BlockSparse.cpp create mode 100644 sparse/unit_test/backends/Test_Threads_BlockSparse.cpp diff --git a/sparse/unit_test/CMakeLists.txt b/sparse/unit_test/CMakeLists.txt index 745df8992f..d591944675 100644 --- a/sparse/unit_test/CMakeLists.txt +++ b/sparse/unit_test/CMakeLists.txt @@ -24,6 +24,14 @@ IF (KOKKOS_ENABLE_CUDA) backends/Test_Cuda_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_cuda + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Cuda_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_HIP) @@ -34,6 +42,14 @@ IF (KOKKOS_ENABLE_HIP) backends/Test_HIP_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_hip + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_HIP_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_SYCL) @@ -44,6 +60,14 @@ IF (KOKKOS_ENABLE_SYCL) 
backends/Test_SYCL_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_OPENMPTARGET) @@ -54,6 +78,14 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) # backends/Test_OpenMPTarget_Sparse.cpp # COMPONENTS sparse # ) + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # blocksparse_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_BlockSparse.cpp + # COMPONENTS sparse + # ) ENDIF () @@ -71,6 +103,14 @@ IF (KOKKOS_ENABLE_SERIAL) backends/Test_Serial_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_OPENMP) @@ -81,6 +121,14 @@ IF (KOKKOS_ENABLE_OPENMP) backends/Test_OpenMP_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_openmp + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_THREADS) @@ -91,5 +139,13 @@ IF (KOKKOS_ENABLE_THREADS) backends/Test_Threads_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () diff --git a/sparse/unit_test/Test_BlockSparse.hpp b/sparse/unit_test/Test_BlockSparse.hpp new file mode 100644 index 0000000000..70f5108e40 --- /dev/null +++ b/sparse/unit_test/Test_BlockSparse.hpp @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_BLOCKSPARSE_HPP +#define TEST_BLOCKSPARSE_HPP + +#include "Test_Sparse_block_gauss_seidel.hpp" +#include "Test_Sparse_BsrMatrix.hpp" +#include "Test_Sparse_bspgemm.hpp" +#include "Test_Sparse_spmv_bsr.hpp" + +#endif // TEST_BLOCKSPARSE_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index e10bac740d..2eb9f6f122 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -20,10 +20,8 @@ #include "Test_Sparse_coo2crs.hpp" #endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" -#include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" -#include "Test_Sparse_BsrMatrix.hpp" #include "Test_Sparse_mdf.hpp" #include "Test_Sparse_findRelOffset.hpp" #include "Test_Sparse_gauss_seidel.hpp" @@ -33,11 +31,9 @@ #include "Test_Sparse_spadd.hpp" #include "Test_Sparse_spgemm_jacobi.hpp" #include "Test_Sparse_spgemm.hpp" -#include "Test_Sparse_bspgemm.hpp" #include "Test_Sparse_SortCrs.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" -#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" #include "Test_Sparse_par_ilut.hpp" diff --git a/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp b/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp new file mode 100644 index 0000000000..d5c73f48d0 --- /dev/null +++ b/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_BLOCKSPARSE_CPP +#define TEST_CUDA_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_CUDA_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp b/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp new file mode 100644 index 0000000000..f040cbf2de --- /dev/null +++ b/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_BLOCKSPARSE_CPP +#define TEST_HIP_BLOCKSPARSE_CPP + +#include "Test_HIP.hpp" +#include "Test_BlockSparse.hpp" + +#endif // TEST_HIP_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp b/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp new file mode 100644 index 0000000000..7ea1bcf3f7 --- /dev/null +++ b/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_BLOCKSPARSE_CPP +#define TEST_OPENMPTARGET_BLOCKSPARSE_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_BlockSparse.hpp" + +#endif // TEST_OPENMPTARGET_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp b/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp new file mode 100644 index 0000000000..739ccf0a59 --- /dev/null +++ b/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_BLOCKSPARSE_CPP +#define TEST_OPENMP_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_OPENMP_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp b/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp new file mode 100644 index 0000000000..3f80795f9f --- /dev/null +++ b/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_BLOCKSPARSE_CPP +#define TEST_SYCL_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_SYCL_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp b/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp new file mode 100644 index 0000000000..69194c0669 --- /dev/null +++ b/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_BLOCKSPARSE_CPP +#define TEST_SERIAL_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_SERIAL_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp b/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp new file mode 100644 index 0000000000..8ec1c442f6 --- /dev/null +++ b/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_BLOCKSPARSE_CPP +#define TEST_THREADS_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_THREADS_BLOCKSPARSE_CPP From 0c72f72dd41bfbc4e4766a06eaeaa0a4490c22aa Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 16 Aug 2023 12:36:45 -0600 Subject: [PATCH 125/231] Update sparse/unit_test/Test_BlockSparse.hpp fix clang-format issue --- sparse/unit_test/Test_BlockSparse.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_BlockSparse.hpp b/sparse/unit_test/Test_BlockSparse.hpp index 70f5108e40..b0dd87c5ed 100644 --- a/sparse/unit_test/Test_BlockSparse.hpp +++ b/sparse/unit_test/Test_BlockSparse.hpp @@ -21,4 +21,4 @@ #include "Test_Sparse_bspgemm.hpp" #include "Test_Sparse_spmv_bsr.hpp" -#endif // TEST_BLOCKSPARSE_HPP +#endif // TEST_BLOCKSPARSE_HPP From 61efa268d775af3e1f8cfee150919679dc27e048 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 17 Aug 2023 23:47:35 -0600 Subject: [PATCH 126/231] Speed up BSR spmv tests (#1945) Rearrange loops, reuse BSR->CRS conversions and transposes to speed up BSR spmv, spm_mv unit tests. The test problems and cases are not changed. --- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 227 ++++++++++++---------- 1 file changed, 127 insertions(+), 100 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index f39eb407c6..839421d916 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -84,6 +84,23 @@ inline bool mode_is_transpose(const char *mode) { return mode[0] == 'T' || mode[0] == 'H'; } +/*! 
\brief Get the max nonzeros (not max nonzero _blocks_) per row of Op(A) */ +template +inline size_t opMaxNnzPerRow(const Bsr &A, bool trans) { + if (trans) { + auto At = KokkosSparse::Impl::transpose_bsr_matrix(A); + return At.blockDim() * + (size_t)KokkosSparse::Impl::graph_max_degree< + typename Bsr::execution_space, typename Bsr::ordinal_type>( + At.graph.row_map); + } else { + return A.blockDim() * + (size_t)KokkosSparse::Impl::graph_max_degree< + typename Bsr::execution_space, typename Bsr::ordinal_type>( + A.graph.row_map); + } +} + /*! \brief 0x0 matrix */ template Bsr bsr_corner_case_0_by_0(const int blockSize) { @@ -128,41 +145,25 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { return KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); } -/*! \brief reference SpMV is the KokkosSparse::spmv on the equivalent point - * matrix - */ -template -void reference_spmv(const char *mode, const Alpha &alpha, const Bsr &a, - const XVector &x, const Beta &beta, const YVector &y) { - using Crs = KokkosSparse::CrsMatrix< - typename Bsr::non_const_value_type, typename Bsr::non_const_ordinal_type, - typename Bsr::device_type, void, typename Bsr::non_const_size_type>; - const Crs crs = KokkosSparse::Impl::bsr_to_crs(a); - - KokkosSparse::spmv(mode, alpha, crs, x, beta, y); -} - /*! 
\brief test a specific spmv */ -template void test_spmv( const std::optional &controls, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, - const XVector &x, const YVector &y) { - using execution_space = typename Bsr::execution_space; - using scalar_type = typename Bsr::non_const_value_type; - using ordinal_type = typename Bsr::non_const_ordinal_type; - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; + const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; - // generate expected result from reference implementation + // generate expected result from reference (CRS) implementation YVector yExp("yExp", y.extent(0)); Kokkos::deep_copy(yExp, y); - reference_spmv(mode, alpha, a, x, beta, yExp); + KokkosSparse::spmv(mode, alpha, acrs, x, beta, yExp); // scratch space for actual value (don't modify input) YVector yAct("yAct", y.extent(0)); @@ -180,23 +181,6 @@ void test_spmv( Kokkos::deep_copy(hyExp, yExp); Kokkos::deep_copy(hyAct, yAct); - // max nnz per row is used for the tolerance - // for a transposed computation, need to transpose the matrix before - // seeing which rows are longest - size_t maxNnzPerRow; - if (mode_is_transpose(mode)) { - auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); - maxNnzPerRow = - at.blockDim() * - KokkosSparse::Impl::graph_max_degree( - at.graph.row_map); - } else { - maxNnzPerRow = - a.blockDim() * - KokkosSparse::Impl::graph_max_degree( - a.graph.row_map); - } - /* assume that any floating-point op may introduce eps() error scaling y is one op dot product of x is two ops per entry (mul and add) @@ -375,8 +359,9 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { /*! 
\brief test all combos of the provided matrix */ -template -void test_spmv_combos(const char *mode, const Bsr &a) { +template +void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow) { using scalar_type = typename Bsr::non_const_value_type; using execution_space = typename Bsr::execution_space; @@ -410,7 +395,7 @@ void test_spmv_combos(const char *mode, const Bsr &a) { {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(ctrl, mode, alpha, beta, a, x, y); + test_spmv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } @@ -422,11 +407,24 @@ template void test_spmv_corner_cases() { using Bsr = KokkosSparse::Experimental::BsrMatrix; + using Crs = KokkosSparse::CrsMatrix; for (auto mode : {"N", "T", "C", "H"}) { for (int bs : {1, 2, 5, 9}) { - test_spmv_combos(mode, bsr_corner_case_0_by_0(bs)); - test_spmv_combos(mode, bsr_corner_case_0_by_1(bs)); - test_spmv_combos(mode, bsr_corner_case_1_by_0(bs)); + { + auto A = bsr_corner_case_0_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_0_by_1(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_1_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } } } } @@ -435,21 +433,37 @@ template void test_spmv_random() { using Bsr = KokkosSparse::Experimental::BsrMatrix; - for (auto mode : {"N", "T", "C", "H"}) { + using Crs = KokkosSparse::CrsMatrix; + // thoroughly test smaller matrices + std::vector> shapes = {{10, 10}, {10, 50}, {50, 10}}; + for (auto &shape : shapes) { for (int bs : {1, 2, 5, 9}) { - test_spmv_combos(mode, bsr_random(bs, 10, 10)); - test_spmv_combos(mode, bsr_random(bs, 10, 50)); - test_spmv_combos(mode, bsr_random(bs, 50, 10)); + auto A = 
bsr_random(bs, shape.first, shape.second); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T", "C", "H"}) { + test_spmv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } // test a tougher case on a big matrix - constexpr int blockSizePrime = 7; - constexpr int smallPrime = 11; - constexpr int largePrime = 499; - for (auto mode : {"N", "T"}) { - test_spmv_combos(mode, - bsr_random(blockSizePrime, smallPrime, largePrime)); + { + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + auto A = bsr_random(blockSizePrime, smallPrime, largePrime); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T"}) { + test_spmv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } @@ -463,22 +477,23 @@ void test_spmv() { // Multivector // ---------------------------------------------------------------------------- -template +// Note: if mode_is_transpose(mode), then maxNnzPerRow is for A^T. Otherwise, +// it's for A. 
+template void test_spm_mv( const std::optional &controls, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, - const XVector &x, const YVector &y) { - using execution_space = typename Bsr::execution_space; - using scalar_type = typename Bsr::non_const_value_type; - using ordinal_type = typename Bsr::non_const_ordinal_type; - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; + const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; - // generate expected result from reference implementation + // generate expected result from reference (CRS) implementation YVector yExp("yExp", y.extent(0), y.extent(1)); Kokkos::deep_copy(yExp, y); - reference_spmv(mode, alpha, a, x, beta, yExp); + KokkosSparse::spmv(mode, alpha, acrs, x, beta, yExp); // scratch space for actual value (don't modify input) YVector yAct("yAct", y.extent(0), y.extent(1)); @@ -496,23 +511,6 @@ void test_spm_mv( Kokkos::deep_copy(hyExp, yExp); Kokkos::deep_copy(hyAct, yAct); - // max nnz per row is used for the tolerance - // for a transposed computation, need to transpose the matrix before - // seeing which rows are longest - size_t maxNnzPerRow; - if (mode_is_transpose(mode)) { - auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); - maxNnzPerRow = - at.blockDim() * - KokkosSparse::Impl::graph_max_degree( - at.graph.row_map); - } else { - maxNnzPerRow = - a.blockDim() * - KokkosSparse::Impl::graph_max_degree( - a.graph.row_map); - } - /* assume that any floating-point op may introduce eps() error scaling y is one op dot product of x is two ops per entry (mul and add) @@ -601,8 +599,9 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, return std::make_tuple(x, y); } -template -void test_spm_mv_combos(const char *mode, const 
Bsr &a) { +template +void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow) { using execution_space = typename Bsr::execution_space; using scalar_type = typename Bsr::non_const_value_type; @@ -636,7 +635,7 @@ void test_spm_mv_combos(const char *mode, const Bsr &a) { scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(ctrl, mode, alpha, beta, a, x, y); + test_spm_mv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } @@ -650,11 +649,24 @@ template ; + using Crs = KokkosSparse::CrsMatrix; for (auto mode : {"N", "T", "C", "H"}) { for (int bs : {1, 2, 5, 9}) { - test_spm_mv_combos(mode, bsr_corner_case_0_by_0(bs)); - test_spm_mv_combos(mode, bsr_corner_case_0_by_1(bs)); - test_spm_mv_combos(mode, bsr_corner_case_1_by_0(bs)); + { + auto A = bsr_corner_case_0_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_0_by_1(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_1_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } } } } @@ -664,22 +676,37 @@ template ; + using Crs = KokkosSparse::CrsMatrix; // thoroughly test smaller matrices - for (auto mode : {"N", "T", "C", "H"}) { + std::vector> shapes = {{10, 10}, {10, 50}, {50, 10}}; + for (auto &shape : shapes) { for (int bs : {1, 2, 5, 9}) { - test_spm_mv_combos(mode, bsr_random(bs, 10, 10)); - test_spm_mv_combos(mode, bsr_random(bs, 10, 50)); - test_spm_mv_combos(mode, bsr_random(bs, 50, 10)); + auto A = bsr_random(bs, shape.first, shape.second); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T", "C", "H"}) { + test_spm_mv_combos( + mode, A, Acrs, + 
mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } // test a tougher case on a big matrix - constexpr int blockSizePrime = 7; - constexpr int smallPrime = 11; - constexpr int largePrime = 499; - for (auto mode : {"N", "T"}) { - test_spm_mv_combos( - mode, bsr_random(blockSizePrime, smallPrime, largePrime)); + { + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + auto A = bsr_random(blockSizePrime, smallPrime, largePrime); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T"}) { + test_spm_mv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } From 9b90ffd82029b0322c7edcca112042c3a3eece0c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 09:39:17 -0600 Subject: [PATCH 127/231] sparse/unit_test: Initialize KernelHandle defaults --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 260bac8a83..662d81fd0b 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -56,7 +56,7 @@ namespace Test { // Run GS on the given vectors, where the handle is already set up. template void run_gauss_seidel( - Handle& kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, + Handle &kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward. 
) { @@ -144,8 +144,8 @@ void run_gauss_seidel( template void run_gauss_seidel_streams( - std::vector kh, std::vector input_mat, - std::vector x_vector, std::vector y_vector, + std::vector &kh, std::vector &input_mat, + std::vector &x_vector, std::vector &y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. int nstreams = 1) { @@ -289,8 +289,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; @@ -446,7 +446,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // initial solution is zero Kokkos::deep_copy(x_host, zero); // get the inverse diagonal (only needed on host) - Kokkos::View invDiag("diag^-1", numRows); + Kokkos::View invDiag("diag^-1", numRows); for (lno_t i = 0; i < numRows; i++) { for (size_type j = rowmap(i); j < rowmap(i + 1); j++) { if (entries(j) == i) invDiag(i) = one / values(j); @@ -624,11 +624,11 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries"), totalEntries); rowmap_view_t rowmapView( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numRows + 1); - Kokkos::deep_copy(valuesView, Kokkos::View( + Kokkos::deep_copy(valuesView, Kokkos::View( values.data(), totalEntries)); - Kokkos::deep_copy(entriesView, Kokkos::View( + Kokkos::deep_copy(entriesView, Kokkos::View( entries.data(), totalEntries)); - Kokkos::deep_copy(rowmapView, Kokkos::View( + Kokkos::deep_copy(rowmapView, Kokkos::View( rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); @@ -786,6 +786,7 @@ void test_gauss_seidel_streams_rank1( 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); x_vector_v[i] = x_vector_tmp; + kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); } @@ -808,6 +809,7 @@ void test_gauss_seidel_streams_rank1( mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); std::string info = "on stream_idx: " + std::to_string(i); EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + kh_v[i].destroy_gs_handle(); } } From 14cf5529f2bdc8cf90bc83f5c416ecdcb37140a6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 11:50:20 -0600 Subject: [PATCH 128/231] sparse/src: Fix PSGS stream cuda regressions --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 662d81fd0b..47ee0eb6f5 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -805,24 +805,14 @@ void test_gauss_seidel_streams_rank1( // Check result for (int i = 0; i < nstreams; i++) { - KokkosBlas::axpby(one, solution_x_v[i], -one, x_vector_v[i]); - mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); std::string info = "on stream_idx: " + std::to_string(i); EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; kh_v[i].destroy_gs_handle(); } } -#if 0 - lno_t numRows, - size_type nnz, - lno_t bandwidth, - lno_t row_size_variance, - bool symmetric, - double omega, - KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, - int nstreams = 1 -#endif #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ From 609e23cbb1b5854074ab421c415a4769de1e9194 Mon Sep 17 
00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 14:18:01 -0600 Subject: [PATCH 129/231] .github/workflows/osx.yml: Double timeout --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 769957b953..8d9f7123f8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -111,4 +111,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure --timeout 3600 + run: ctest -j2 --output-on-failure --timeout 7200 From f05cbf117528e3e9e538870ab0628d4268ce643a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:22:09 -0600 Subject: [PATCH 130/231] sparse/src: Add gauss_seidel_symbolic overload --- docs/developer/apidocs/sparse.rst | 1 + .../impl/KokkosSparse_gauss_seidel_spec.hpp | 18 +++++--- sparse/src/KokkosSparse_gauss_seidel.hpp | 41 +++++++++++++++++-- .../src/KokkosSparse_gauss_seidel_handle.hpp | 12 ++++++ sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 17 ++++---- 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index f73b507439..091c0d02d7 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,6 +60,7 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. 
doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index f04ae34fc9..026c6c932d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -120,14 +120,16 @@ struct gauss_seidel_apply_eti_spec_avail { namespace KokkosSparse { namespace Impl { -template ::value, bool eti_spec_avail = gauss_seidel_symbolic_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t>::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); }; @@ -174,15 +176,19 @@ struct GAUSS_SEIDEL_APPLY { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct GAUSS_SEIDEL_SYMBOLIC +struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { 
Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_symbolic"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel< KernelHandle, a_size_view_t_, a_lno_view_t_, @@ -322,6 +328,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -337,6 +344,7 @@ struct GAUSS_SEIDEL_APPLY, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9f1b9d8cb1..93e88f5875 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,10 +29,13 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -42,9 +45,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) 
has been called previously /// -template -void gauss_seidel_symbolic(KernelHandle *handle, +void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -95,13 +98,43 @@ void gauss_seidel_symbolic(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(&tmp_handle, num_rows, + ExecSpaceIn, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) 
has been called previously +/// +template +void gauss_seidel_symbolic(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_symbolic(my_exec_space, handle, num_rows, num_cols, row_map, + entries, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (first phase, based on /// sparsity pattern only) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 134a100cc7..98624a4137 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -148,6 +148,18 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } + template + void set_execution_space(const ExecSpaceIn exec_space_in) { + static bool is_set = false; + if (!is_set) { + static_assert(std::is_same::value, + "The type of exec_space_in should be the same as " + "GaussSeidelHandle::HandleExecSpace"); + this->execution_space = exec_space_in; + } + is_set = true; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 47ee0eb6f5..73b363a0b0 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -142,15 +142,17 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } -template +template void run_gauss_seidel_streams( - std::vector &kh, std::vector &input_mat, - std::vector &x_vector, std::vector &y_vector, - bool is_symmetric_graph, typename crsMat_t::value_type omega, + std::vector &instances, 
std::vector &kh, + std::vector &input_mat, std::vector &x_vector, + std::vector &y_vector, bool is_symmetric_graph, + typename crsMat_t::value_type omega, int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. int nstreams = 1) { for (int i = 0; i < nstreams; i++) { - gauss_seidel_symbolic(&kh[i], input_mat[i].numRows(), + gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), input_mat[i].graph.row_map, input_mat[i].graph.entries, is_symmetric_graph); gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), @@ -797,8 +799,9 @@ void test_gauss_seidel_streams_rank1( for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); - run_gauss_seidel_streams(kh_v, input_mat_v, x_vector_v, y_vector_v, - symmetric, m_omega, apply_type, nstreams); + run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); // double gs = timer1.seconds(); // KokkosKernels::Impl::print_1Dview(x_vector); } From 92ad57faac33ef7db7db60180931805468e7396a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:43:53 -0600 Subject: [PATCH 131/231] docs: Add create_gs_handle docs --- docs/developer/apidocs/sparse.rst | 3 ++ sparse/src/KokkosKernels_Handle.hpp | 64 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 091c0d02d7..7dcf65ea2c 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,6 +60,9 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. 
doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 307ff7b91c..7776d746af 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -601,6 +601,30 @@ class KokkosKernelsHandle { "GS."); return cgs; } + + /** + * @brief Create a gauss seidel handle object + * + * @param handle_exec_space The execution space instance to execute kernels + * on. + * @param num_streams The number of streams to allocate memory for. + * @param gs_algorithm Specifies which algorithm to use: + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? 
KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle( HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, @@ -618,6 +642,26 @@ class KokkosKernelsHandle { handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); } + /** + * @brief Create a gauss seidel handle object + * + * @param gs_algorithm Specifies which algorithm to use: + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? 
KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle( KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = @@ -683,6 +727,26 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + /** + * @brief Create a gs handle object + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? 
KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = From 21f8aae598f43f6ad668747d87b05264c873e877 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:56:50 -0600 Subject: [PATCH 132/231] docs: Improve docs formatting --- sparse/src/KokkosKernels_Handle.hpp | 92 +++++++++++++++++------------ 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 7776d746af..a23826f864 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -602,29 +602,33 @@ class KokkosKernelsHandle { return cgs; } + // clang-format off /** * @brief Create a gauss seidel handle object - * - * @param handle_exec_space The execution space instance to execute kernels - * on. + * + * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? 
KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle( HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, @@ -642,26 +646,31 @@ class KokkosKernelsHandle { handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); } + // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param gs_algorithm Specifies which algorithm to use: + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? 
KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle( KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = @@ -727,26 +736,31 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + // clang-format off /** * @brief Create a gs handle object - * + * * @param clusterAlgo Specifies which clustering algorithm to use: - * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? - * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? 
+ * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? 
+ * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = From 1adc5e418ad6f3e6e2db53ce9a8b1dfee3a7315f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 14:48:56 -0600 Subject: [PATCH 133/231] sparse/unit_test: Launch nrm2 on stream --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 73b363a0b0..90875e18bd 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -778,7 +778,7 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); solution_x_v[i] = solution_x_tmp; create_random_x_vector(solution_x_v[i]); - initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(instances[i], solution_x_v[i]); y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the // behavior of each algorithm _should be_ the same on every execution space, From 2082b66c2b5a7f0c10e993823e499824230b7f31 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 18:14:13 -0600 Subject: [PATCH 
134/231] sparse/src: Add GS numeric overload --- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 32 ++-- sparse/src/KokkosSparse_gauss_seidel.hpp | 153 ++++++++++++++++-- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 7 +- 3 files changed, 168 insertions(+), 24 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 026c6c932d..b9a89d2579 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -135,20 +135,23 @@ struct GAUSS_SEIDEL_SYMBOLIC { }; template < - class KernelHandle, KokkosSparse::SparseMatrixFormat format, - class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, + class ExecSpaceIn, class KernelHandle, + KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, + class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, bool eti_spec_avail = gauss_seidel_numeric_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); @@ -212,17 +215,20 @@ struct GAUSS_SEIDEL_SYMBOLIC -struct GAUSS_SEIDEL_NUMERIC +struct 
GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_numeric"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidelget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -375,6 +384,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -394,6 +404,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -409,6 +420,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 93e88f5875..b7485e68dc 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -175,12 +175,15 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// +/// @tparam ExecSpaceIn This kernels execution space type. 
/// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -190,11 +193,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(KernelHandle *handle, +void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -258,9 +262,10 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, const_a_v, is_graph_symmetric); @@ -284,7 +289,6 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param row_map The matrix's rowmap /// @param entries The matrix's entries /// @param values The matrix's values -/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// @remark If the inverse diagonal is not already available, it's best to call @@ -296,6 +300,50 @@ 
template void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, is_graph_symmetric); +} + +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. 
+template +void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -362,25 +410,71 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, const_a_v, const_a_d, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. 
The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + scalar_nnz_view_t_ given_inverse_diagonal, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, given_inverse_diagonal, + is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. 
/// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -391,12 +485,14 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template void block_gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -409,8 +505,43 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(handle, num_rows, num_cols, row_map, entries, - values, is_graph_symmetric); + gauss_seidel_numeric(exec_space_in, handle, num_rows, num_cols, + row_map, entries, values, is_graph_symmetric); +} + +/// +/// @brief Block Gauss-Seidel preconditioner setup (second phase, based on +/// matrix's numeric values) +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param block_size The number of degrees of freedom per block +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// +template +void 
block_gauss_seidel_numeric( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, is_graph_symmetric); } /// diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 90875e18bd..0a5849cbb0 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -155,9 +155,10 @@ void run_gauss_seidel_streams( gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), input_mat[i].graph.row_map, input_mat[i].graph.entries, is_symmetric_graph); - gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, is_symmetric_graph); + gauss_seidel_numeric(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, + is_symmetric_graph); } const int apply_count = 2; From dd3fbb5375680ccf4ea299a64e95eeac6b617dfa Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 18:15:19 -0600 Subject: [PATCH 135/231] sparse/src: Add GS apply overload. - Use execution space instance throughout. 
--- common/src/KokkosKernels_SimpleUtils.hpp | 4 - common/src/KokkosKernels_Utils.hpp | 60 +++-- docs/developer/apidocs/sparse.rst | 5 + .../impl/KokkosSparse_gauss_seidel_impl.hpp | 68 ++++-- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 32 ++- sparse/src/KokkosSparse_gauss_seidel.hpp | 224 +++++++++++++++--- .../src/KokkosSparse_gauss_seidel_handle.hpp | 3 +- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 55 +++-- 8 files changed, 339 insertions(+), 112 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 64735874c6..e25ec54eb0 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -142,10 +142,6 @@ inline void kk_exclusive_parallel_prefix_sum( kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -template -void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, - forward_array_type arr) {} - /// /// \brief Function performs the inclusive parallel prefix sum. That is each /// entry holds the sum until itself including itself. 
diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 11b4100f31..552e994892 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -687,9 +687,11 @@ void create_reverse_map( typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), + num_reverse_elements + 1); reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "REVERSE_ADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "REVERSE_ADJ"), num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { @@ -703,7 +705,9 @@ void create_reverse_map( const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj("TMP_REVERSE_XADJ", tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj( + Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), + tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( forward_map, tmp_color_xadj, multiply_shift_for_scale, @@ -714,7 +718,7 @@ void create_reverse_map( my_exec_space.fence(); inclusive_parallel_prefix_sum( - tmp_reverse_size + 1, tmp_color_xadj); + my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); my_exec_space.fence(); Kokkos::parallel_for( @@ -734,7 +738,8 @@ void create_reverse_map( // atomic implementation. 
{ reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "TMP_REVERSE_XADJ"), num_reverse_elements + 1); Reverse_Map_Init rmi( @@ -747,9 +752,8 @@ void create_reverse_map( // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( - num_reverse_elements + 1, reverse_map_xadj); - my_exec_space.fence(); - Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); + my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); @@ -804,18 +808,30 @@ struct PermuteVector { template -void permute_vector(typename idx_array_type::value_type num_elements, +void permute_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteVector( old_vector, new_vector, old_to_new_index_map)); } +template +void permute_vector(typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, + new_vector); +} + template struct PermuteBlockVector { @@ -849,19 +865,30 @@ struct PermuteBlockVector { template -void permute_block_vector(typename idx_array_type::value_type num_elements, +void permute_block_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, int block_size, idx_array_type 
&old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; - + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteBlockVector(block_size, old_vector, new_vector, old_to_new_index_map)); } +template +void permute_block_vector(typename idx_array_type::value_type num_elements, + int block_size, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_block_vector(MyExecSpace(), num_elements, block_size, + old_to_new_index_map, old_vector, new_vector); +} + // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. @@ -1301,8 +1328,7 @@ void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, size_type &max_row_size) { - MyExecSpace my_exec_space; - return kk_view_reduce_max_row_size(my_exec_space, num_rows, + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, rowmap_view_begins, rowmap_view_ends, max_row_size); } diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 7dcf65ea2c..1f7e702fb4 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -65,10 +65,15 @@ gauss_seidel .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. 
doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. 
doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. 
doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 41809203e2..d9d45dbb85 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -874,6 +874,9 @@ class PointGaussSeidel { colors = gchandle->get_vertex_colors(); numColors = gchandle->get_num_colors(); } + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; timer.reset(); @@ -921,11 +924,11 @@ class PointGaussSeidel { // Count long rows per color set, and sort color sets so that long rows // come after regular rows nnz_lno_persistent_work_view_t long_rows_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "long_rows_per_color"), numColors); nnz_lno_persistent_work_view_t max_row_length_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "max_row_length_per_color"), numColors); nnz_lno_t mostLongRowsInColor = 0; @@ -954,7 +957,8 @@ class PointGaussSeidel { my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + 
"long_row_x"), mostLongRowsInColor); gsHandle->set_long_row_x(long_row_x); } else { @@ -1134,6 +1138,7 @@ class PointGaussSeidel { gsHandle->set_new_adj(permuted_adj); gsHandle->set_old_to_new_map(old_to_new_map); gsHandle->set_call_symbolic(true); + my_exec_space.fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << timer.seconds() << std::endl; #endif @@ -1352,6 +1357,7 @@ class PointGaussSeidel { const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; const_scalar_nnz_view_t adj_vals = this->values; + MyExecSpace my_exec_space = gsHandle->get_execution_space(); size_type nnz = adj_vals.extent(0); @@ -1362,14 +1368,16 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); scalar_persistent_work_view_t permuted_adj_vals( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "newvals_"), nnz); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "newvals_"), + nnz); int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); nnz_lno_t block_size = gsHandle->get_block_size(); nnz_lno_t block_matrix_size = block_size * block_size; @@ -1393,7 +1401,8 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), fill_matrix_numeric(color_adj, xadj, // adj, @@ -1405,7 +1414,7 @@ class PointGaussSeidel { block_matrix_size)); } else { 
Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_policy_t(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1418,7 +1427,7 @@ class PointGaussSeidel { gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "permuted_inverse_diagonal"), num_rows * block_size); if (!have_diagonal_given) { @@ -1430,13 +1439,14 @@ class PointGaussSeidel { block_size > 1) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_policy_t(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } else { @@ -1444,13 +1454,13 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, given_inverse_diagonal, - permuted_inverse_diagonal); + my_exec_space, num_rows, block_size, old_to_new_map, + given_inverse_diagonal, permuted_inverse_diagonal); else KokkosKernels::Impl::permute_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, given_inverse_diagonal, + my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } @@ -1458,7 +1468,7 @@ class PointGaussSeidel { gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "NUMERIC:" << timer.seconds() << 
std::endl; #endif } @@ -1684,6 +1694,7 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t, MyExecSpace>( num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG + Kokkos::fence(); std::cout << "--point After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); std::cout << "--point Result X:"; @@ -1722,7 +1733,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1733,7 +1745,7 @@ class PointGaussSeidel { max_row_length_per_color = gsHandle->get_max_row_length_per_color(); long_row_x = gsHandle->get_long_row_x(); haveLongRows = true; - longrow_apply_team_policy_t tempPolicy(1, 1); + longrow_apply_team_policy_t tempPolicy(my_exec_space, 1, 1); longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag()); } @@ -1782,7 +1794,8 @@ class PointGaussSeidel { Kokkos::parallel_for( labelRegular, Kokkos::Experimental::require( - team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_policy_t(my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), @@ -1792,6 +1805,7 @@ class PointGaussSeidel { labelBlock, Kokkos::Experimental::require( block_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), @@ -1802,6 +1816,7 @@ class PointGaussSeidel { labelBigBlock, Kokkos::Experimental::require( bigblock_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, 
suggested_team_size, vector_size), @@ -1828,14 +1843,16 @@ class PointGaussSeidel { Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( - longrow_apply_team_policy_t(numLongRows * teams_per_row, + longrow_apply_team_policy_t(my_exec_space, + numLongRows * teams_per_row, longRowTeamSize), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_policy_t(color_index_end - numLongRows, + range_policy_t(my_exec_space, + color_index_end - numLongRows, color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( @@ -1852,7 +1869,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1889,7 +1907,7 @@ class PointGaussSeidel { Kokkos::parallel_for( labelShort, Kokkos::Experimental::require( - range_policy_t(color_index_begin, + range_policy_t(my_exec_space, color_index_begin, color_index_end - numLongRows), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); @@ -1906,18 +1924,20 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); + my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( Kokkos::RangePolicy( - 0, numLongRows * par_per_row), + my_exec_space, 0, numLongRows * par_per_row), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( 
"KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_policy_t(color_index_end - numLongRows, + range_policy_t(my_exec_space, + color_index_end - numLongRows, color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index b9a89d2579..84c9dccf5c 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -157,9 +157,10 @@ struct GAUSS_SEIDEL_NUMERIC { a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template ::value, @@ -168,7 +169,8 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -287,14 +289,17 @@ struct GAUSS_SEIDEL_NUMERIC -struct GAUSS_SEIDEL_APPLY { +template +struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -303,6 +308,7 @@ struct GAUSS_SEIDEL_APPLYget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -461,6 +468,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ 
KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -486,6 +494,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -507,6 +516,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index b7485e68dc..505b8c55a8 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -505,8 +505,13 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(exec_space_in, handle, num_rows, num_cols, - row_map, entries, values, is_graph_symmetric); + gauss_seidel_numeric(exec_space_in, handle, num_rows, + num_cols, row_map, entries, values, + is_graph_symmetric); + + /* gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, given_inverse_diagonal, + is_graph_symmetric); */ } /// @@ -540,14 +545,16 @@ void block_gauss_seidel_numeric( lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric = true) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, is_graph_symmetric); + block_gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, + block_size, row_map, entries, values, + is_graph_symmetric); } /// /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// +/// @tparam ExecSpaceIn This kernels execution space type. 
/// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -558,6 +565,8 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -574,13 +583,15 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -696,13 +707,63 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, true); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, true, true); +} + +/// +/// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to +/// system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type 
+/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run (forward and backward counts as 1) +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void symmetric_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + symmetric_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// @@ -785,6 +846,8 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 
View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -801,13 +864,15 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -925,13 +990,62 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, false); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, true, false); +} + +/// +/// @brief Apply forward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. 
+/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void forward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + forward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// @@ -1003,6 +1117,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// +/// @tparam ExecSpaceIn This kernels execution space type. 
/// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1013,6 +1128,8 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -1029,13 +1146,15 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -1153,13 +1272,62 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, false, true); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, false, true); +} + +/// +/// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type 
+/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void backward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + backward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 
98624a4137..80ba6e5153 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -104,8 +104,7 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : execution_space(HandleExecSpace()), - num_streams(1), + : num_streams(1), algorithm_type(gs), color_xadj(), color_adj(), diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 0a5849cbb0..820d9ae447 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -166,31 +166,31 @@ void run_gauss_seidel_streams( switch (apply_type) { case 0: symmetric_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; case 1: forward_sweep_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; case 2: backward_sweep_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, 
input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; default: symmetric_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; } } @@ -779,7 +779,7 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); solution_x_v[i] = solution_x_tmp; create_random_x_vector(solution_x_v[i]); - initial_norm_res_v[i] = KokkosBlas::nrm2(instances[i], solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the // behavior of each algorithm _should be_ the same on every execution space, @@ -798,23 +798,26 @@ void test_gauss_seidel_streams_rank1( for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; - for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + for (int i = 0; i < nstreams; i++) instances[i].fence(); run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); // double gs = timer1.seconds(); // KokkosKernels::Impl::print_1Dview(x_vector); + for (int i = 0; i < nstreams; i++) { + instances[i].fence(); // Wait for apply to finish updating x_vector + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + std::string info = "on stream_idx: " 
+ std::to_string(i); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + } } - // Check result - for (int i = 0; i < nstreams; i++) { - KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); - mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); - std::string info = "on stream_idx: " + std::to_string(i); - EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; - kh_v[i].destroy_gs_handle(); - } + for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From a6407290f4b393f5b5f20068b6de921c490b544a Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 21 Aug 2023 00:54:13 -0700 Subject: [PATCH 136/231] Add an utility function to extract diagonal blocks from a crsmatrix --- sparse/src/KokkosSparse_Utils.hpp | 114 ++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 4039b6f5a7..a33d3e9d8e 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2330,6 +2330,120 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } } +template +void kk_extract_diagonal_blocks_crsmatrix_sequential(const crsMat_t &A, + std::vector& DiagBlk_v) { + using row_map_type = typename crsMat_t::row_map_type; + using entries_type = typename crsMat_t::index_type; + using values_type = typename crsMat_t::values_type; + using row_map_hostmirror_type = typename row_map_type::HostMirror; + using entries_hostmirror_type = typename entries_type::HostMirror; + using values_hostmirror_type = typename values_type::HostMirror; + using int_view1d_type = Kokkos::View; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using out_row_map_type = typename graph_t::row_map_type::non_const_type; + using out_entries_type = typename graph_t::entries_type::non_const_type; + using out_values_type = typename 
crsMat_t::values_type::non_const_type; + using out_row_map_hostmirror_type = typename out_row_map_type::HostMirror; + using out_entries_hostmirror_type = typename out_entries_type::HostMirror; + using out_values_hostmirror_type = typename out_values_type::HostMirror; + + row_map_type A_row_map = A.graph.row_map; + entries_type A_entries = A.graph.entries; + values_type A_values = A.values; + + row_map_hostmirror_type A_row_map_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_row_map); + entries_hostmirror_type A_entries_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_entries); + values_hostmirror_type A_values_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values); + + int A_nrows = static_cast(A_row_map.extent(0))-1; + int n_blocks = static_cast(DiagBlk_v.size()); + + int rows_per_block = ((A_nrows%n_blocks)==0) ? (A_nrows/n_blocks) : (A_nrows/n_blocks+1); + + std::vector row_map_v(n_blocks); + std::vector entries_v(n_blocks); + std::vector values_v (n_blocks); + std::vector row_map_h_v(n_blocks); + std::vector entries_h_v(n_blocks); + std::vector values_h_v (n_blocks); + + int row_start = 0; // first row index of i-th diagonal block + int col_start = 0; // first col index of i-th diagonal block + int nrows, ncols; // Nrows, Ncols of i-th diagonal block + for (int i = 0; i < n_blocks; i++) { + nrows = rows_per_block; + if ((row_start + rows_per_block) > A_nrows) { + nrows = A_nrows - row_start; + } + col_start = row_start; + ncols = nrows; + + // Rowmap of i-th row-oriented sub-matrix + auto A_row_map_sub = Kokkos::subview(A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); + + // First round: count i-th non-zeros or size of entries_v[i] + int n_entries = 0; + int_view1d_type first("first", nrows); // first position per row + int_view1d_type last ("last", nrows); // last position per row + + for (int j = 0; j < nrows; j++) { // loop through each row + int k1 = static_cast(A_row_map_sub(j)); + int k2 = 
static_cast(A_row_map_sub(j + 1)); + int k; + // Assume column indices are sorted in ascending order + // Find the position of the start column in the row + for (k = k1; k < k2; k++) { + int col = static_cast(A_entries_h(k)); + if (col >= col_start) { + break; + } + } + first(j) = k; + // Find the position of the last column in the row + for (k = k2-1; k >= k1; k--) { + int col = static_cast(A_entries_h(k)); + if (col < col_start + ncols) { + break; + } + } + last(j) = k; + n_entries += (last(j) - first(j) + 1); + } + + // Second round: + // - create row_map_v[i] + // - copy A_entries to entries_v[i] and update entries_v[i] with local column indices + // - copy A_values to values_v[i] + row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); + entries_v[i] = out_entries_type("entries_v", n_entries); + values_v[i] = out_values_type ("values_v", n_entries); + row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); + entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); + values_h_v[i] = out_values_hostmirror_type ("values_h_v", n_entries); + int first_ = 0; + for (int j = 0; j < nrows; j++) { // loop through each row + int nnz = last(j) - first(j) + 1; + row_map_h_v[i](j) = first_; + for (int k = 0; k < nnz; k++) { + entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; + values_h_v[i] (first_ + k) = A_values_h (first(j) + k); + } + first_ += nnz; + } + row_map_h_v[i](nrows) = n_entries; // last element + + Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); + Kokkos::deep_copy(entries_v[i], entries_h_v[i]); + Kokkos::deep_copy(values_v[i], values_h_v[i]); + + DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], row_map_v[i], entries_v[i]); + + row_start += nrows; + } +} + } // namespace Impl using Impl::isCrsGraphSorted; From 90bef9f9efdb00a132b1a7710fa0e8f1d87d04b6 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Mon, 21 Aug 2023 02:08:32 -0600 Subject: [PATCH 137/231] Apply clang 
format --- sparse/src/KokkosSparse_Utils.hpp | 80 +++++++++++++++++-------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index a33d3e9d8e..0e5dc6d9b0 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2331,17 +2331,18 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } template -void kk_extract_diagonal_blocks_crsmatrix_sequential(const crsMat_t &A, - std::vector& DiagBlk_v) { - using row_map_type = typename crsMat_t::row_map_type; - using entries_type = typename crsMat_t::index_type; - using values_type = typename crsMat_t::values_type; +void kk_extract_diagonal_blocks_crsmatrix_sequential( + const crsMat_t &A, std::vector &DiagBlk_v) { + using row_map_type = typename crsMat_t::row_map_type; + using entries_type = typename crsMat_t::index_type; + using values_type = typename crsMat_t::values_type; using row_map_hostmirror_type = typename row_map_type::HostMirror; using entries_hostmirror_type = typename entries_type::HostMirror; using values_hostmirror_type = typename values_type::HostMirror; - using int_view1d_type = Kokkos::View; + using int_view1d_type = + Kokkos::View; - using graph_t = typename crsMat_t::StaticCrsGraphType; + using graph_t = typename crsMat_t::StaticCrsGraphType; using out_row_map_type = typename graph_t::row_map_type::non_const_type; using out_entries_type = typename graph_t::entries_type::non_const_type; using out_values_type = typename crsMat_t::values_type::non_const_type; @@ -2351,44 +2352,49 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(const crsMat_t &A, row_map_type A_row_map = A.graph.row_map; entries_type A_entries = A.graph.entries; - values_type A_values = A.values; + values_type A_values = A.values; - row_map_hostmirror_type A_row_map_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_row_map); - entries_hostmirror_type A_entries_h = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_entries); - values_hostmirror_type A_values_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values); + row_map_hostmirror_type A_row_map_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_row_map); + entries_hostmirror_type A_entries_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_entries); + values_hostmirror_type A_values_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values); - int A_nrows = static_cast(A_row_map.extent(0))-1; + int A_nrows = static_cast(A_row_map.extent(0)) - 1; int n_blocks = static_cast(DiagBlk_v.size()); - - int rows_per_block = ((A_nrows%n_blocks)==0) ? (A_nrows/n_blocks) : (A_nrows/n_blocks+1); + + int rows_per_block = ((A_nrows % n_blocks) == 0) ? (A_nrows / n_blocks) + : (A_nrows / n_blocks + 1); std::vector row_map_v(n_blocks); std::vector entries_v(n_blocks); - std::vector values_v (n_blocks); + std::vector values_v(n_blocks); std::vector row_map_h_v(n_blocks); std::vector entries_h_v(n_blocks); - std::vector values_h_v (n_blocks); + std::vector values_h_v(n_blocks); - int row_start = 0; // first row index of i-th diagonal block - int col_start = 0; // first col index of i-th diagonal block - int nrows, ncols; // Nrows, Ncols of i-th diagonal block + int row_start = 0; // first row index of i-th diagonal block + int col_start = 0; // first col index of i-th diagonal block + int nrows, ncols; // Nrows, Ncols of i-th diagonal block for (int i = 0; i < n_blocks; i++) { nrows = rows_per_block; if ((row_start + rows_per_block) > A_nrows) { nrows = A_nrows - row_start; } col_start = row_start; - ncols = nrows; + ncols = nrows; // Rowmap of i-th row-oriented sub-matrix - auto A_row_map_sub = Kokkos::subview(A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); + auto A_row_map_sub = Kokkos::subview( + A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); // First round: count i-th non-zeros or 
size of entries_v[i] int n_entries = 0; - int_view1d_type first("first", nrows); // first position per row - int_view1d_type last ("last", nrows); // last position per row + int_view1d_type first("first", nrows); // first position per row + int_view1d_type last("last", nrows); // last position per row - for (int j = 0; j < nrows; j++) { // loop through each row + for (int j = 0; j < nrows; j++) { // loop through each row int k1 = static_cast(A_row_map_sub(j)); int k2 = static_cast(A_row_map_sub(j + 1)); int k; @@ -2402,7 +2408,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(const crsMat_t &A, } first(j) = k; // Find the position of the last column in the row - for (k = k2-1; k >= k1; k--) { + for (k = k2 - 1; k >= k1; k--) { int col = static_cast(A_entries_h(k)); if (col < col_start + ncols) { break; @@ -2414,31 +2420,33 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(const crsMat_t &A, // Second round: // - create row_map_v[i] - // - copy A_entries to entries_v[i] and update entries_v[i] with local column indices + // - copy A_entries to entries_v[i] and update entries_v[i] with local + // column indices // - copy A_values to values_v[i] - row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); - entries_v[i] = out_entries_type("entries_v", n_entries); - values_v[i] = out_values_type ("values_v", n_entries); + row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); + entries_v[i] = out_entries_type("entries_v", n_entries); + values_v[i] = out_values_type("values_v", n_entries); row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); - values_h_v[i] = out_values_hostmirror_type ("values_h_v", n_entries); - int first_ = 0; - for (int j = 0; j < nrows; j++) { // loop through each row - int nnz = last(j) - first(j) + 1; + values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); + int first_ = 0; + for (int j = 0; j < nrows; j++) { // loop through 
each row + int nnz = last(j) - first(j) + 1; row_map_h_v[i](j) = first_; for (int k = 0; k < nnz; k++) { entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; - values_h_v[i] (first_ + k) = A_values_h (first(j) + k); + values_h_v[i](first_ + k) = A_values_h(first(j) + k); } first_ += nnz; } - row_map_h_v[i](nrows) = n_entries; // last element + row_map_h_v[i](nrows) = n_entries; // last element Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); Kokkos::deep_copy(entries_v[i], entries_h_v[i]); Kokkos::deep_copy(values_v[i], values_h_v[i]); - DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], row_map_v[i], entries_v[i]); + DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], + row_map_v[i], entries_v[i]); row_start += nrows; } From 5c41061189d9ca4d218368e97405ffdebf195874 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 21 Aug 2023 16:26:36 -0600 Subject: [PATCH 138/231] Various fixes --- common/src/KokkosKernels_Utils.hpp | 17 +++++- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 42 ++++++++------- sparse/src/KokkosSparse_Utils.hpp | 17 ++++-- sparse/src/KokkosSparse_gauss_seidel.hpp | 54 ++----------------- .../src/KokkosSparse_gauss_seidel_handle.hpp | 3 +- 5 files changed, 58 insertions(+), 75 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 552e994892..c6780185a4 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -892,11 +892,24 @@ void permute_block_vector(typename idx_array_type::value_type num_elements, // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. 
+template +void zero_vector(ExecSpaceIn &exec_space_in, + typename value_array_type::value_type /* num_elements */, + value_array_type &vector) { + typedef typename value_array_type::non_const_value_type val_type; + Kokkos::deep_copy(exec_space_in, vector, + Kokkos::ArithTraits::zero()); + exec_space_in.fence(); +} + template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { - typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); + using ne_tmp_t = typename value_array_type::value_type; + ne_tmp_t ne_tmp = ne_tmp_t(0); + MyExecSpace my_exec_space; + zero_vector(my_exec_space, ne_tmp, + vector); } template diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d9d45dbb85..1f386a28a4 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1091,8 +1091,8 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows, - my_exec_space); + my_exec_space, brows, permuted_xadj, num_values_in_l1, + num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, (size_type)(my_exec_space.concurrency() / suggested_vector_size)); @@ -1463,7 +1463,6 @@ class PointGaussSeidel { my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } - gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); gsHandle->set_call_numeric(true); } @@ -1530,24 +1529,25 @@ class PointGaussSeidel { scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); - color_t numColors = gsHandle->get_num_colors(); + color_t numColors = gsHandle->get_num_colors(); + auto my_exec_space = gsHandle->get_execution_space(); 
if (update_y_vector) { KokkosKernels::Impl::permute_block_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, y_rhs_input_vec, + my_exec_space, num_rows, block_size, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols * block_size, - Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, old_to_new_map, x_lhs_output_vec, + my_exec_space, num_cols, block_size, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); } @@ -1580,7 +1580,7 @@ class PointGaussSeidel { int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), brows); + suggested_team_size, my_exec_space.concurrency(), brows); // size_t shmem_size_to_use = this->handle->get_shmem_size(); size_t l1_shmem_size = gsHandle->get_level_1_mem(); @@ -1613,7 +1613,8 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, block_size, color_adj, Permuted_Xvector, + x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1631,7 +1632,8 @@ class PointGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { - auto gsHandle = get_gs_handle(); + auto 
gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); auto Permuted_Xvector = gsHandle->get_permuted_x_vector(); auto Permuted_Yvector = gsHandle->get_permuted_y_vector(); @@ -1651,16 +1653,19 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); + my_exec_space, num_rows, old_to_new_map, y_rhs_input_vec, + Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); + my_exec_space, num_cols, old_to_new_map, x_lhs_output_vec, + Permuted_Xvector); } #if KOKKOSSPARSE_IMPL_PRINTDEBUG @@ -1692,7 +1697,7 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG Kokkos::fence(); std::cout << "--point After X:"; @@ -1839,7 +1844,8 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); + my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 88258356ef..f61f470814 100644 --- 
a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1885,10 +1885,9 @@ struct ReduceLargerRowCount { template void kk_reduce_numrows_larger_than_threshold( - size_t num_elements, view_type view_to_reduce, - typename view_type::const_value_type threshold, - typename view_type::non_const_value_type &sum_reduction, - MyExecSpace my_exec_space = MyExecSpace()) { + const MyExecSpace &my_exec_space, size_t num_elements, + view_type view_to_reduce, typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", @@ -1897,6 +1896,16 @@ void kk_reduce_numrows_larger_than_threshold( sum_reduction); } +template +void kk_reduce_numrows_larger_than_threshold( + size_t num_elements, view_type view_to_reduce, + typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { + MyExecSpace my_exec_space; + kk_reduce_numrows_larger_than_threshold( + my_exec_space, num_elements, view_to_reduce, threshold, sum_reduction); +} + // Note: "block" in member name means it's block internal - otherwise it // addresses sparse rows/columns (whole blocks) within whole matrix. template diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 505b8c55a8..9db3a1b2d3 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -466,15 +466,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) /// -/// @tparam ExecSpaceIn This kernels execution space type. 
/// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -485,14 +482,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template void block_gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, - typename KernelHandle::const_nnz_lno_t num_rows, + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -505,49 +500,8 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(exec_space_in, handle, num_rows, - num_cols, row_map, entries, values, - is_graph_symmetric); - - /* gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, given_inverse_diagonal, - is_graph_symmetric); */ -} - -/// -/// @brief Block Gauss-Seidel preconditioner setup (second phase, based on -/// matrix's numeric values) -/// -/// @tparam format The matrix storage format, CRS or BSR -/// @tparam KernelHandle A specialization of -/// KokkosKernels::Experimental::KokkosKernelsHandle -/// @tparam lno_row_view_t_ The matrix's rowmap type -/// @tparam lno_nnz_view_t_ The matrix's entries type -/// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param handle handle A 
KokkosKernelsHandle instance -/// @param num_rows Number of rows in the matrix -/// @param num_cols Number of columns in the matrix -/// @param block_size The number of degrees of freedom per block -/// @param row_map The matrix's rowmap -/// @param entries The matrix's entries -/// @param values The matrix's values -/// @param is_graph_symmetric Whether the upper-left num_rows x -/// num_rows submatrix of A is structurally symmetric -/// -template -void block_gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, - typename KernelHandle::const_nnz_lno_t num_cols, - typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, - bool is_graph_symmetric = true) { - auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - block_gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, - block_size, row_map, entries, values, - is_graph_symmetric); + gauss_seidel_numeric(handle, num_rows, num_cols, row_map, entries, + values, is_graph_symmetric); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 80ba6e5153..98624a4137 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -104,7 +104,8 @@ class GaussSeidelHandle { * \brief Default constructor. 
*/ GaussSeidelHandle(GSAlgorithm gs) - : num_streams(1), + : execution_space(HandleExecSpace()), + num_streams(1), algorithm_type(gs), color_xadj(), color_adj(), From f5618f874750d9f3d518a655368e2bb8dc8ed445 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 07:28:55 -0600 Subject: [PATCH 139/231] sparse/src: Update GS apply docs --- sparse/src/KokkosSparse_gauss_seidel.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9db3a1b2d3..b7ce643cf9 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -520,7 +520,7 @@ void block_gauss_seidel_numeric( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -801,7 +801,7 @@ void symmetric_block_gauss_seidel_apply( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -1083,7 +1083,7 @@ void forward_sweep_block_gauss_seidel_apply( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix From 987b42fb7a81b3ec8a1a9cc14f5f9392b93b0592 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 07:42:37 -0600 Subject: [PATCH 140/231] .github/workflows: Pin sphinx version --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 33f5f48e39..558b6bd96d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -16,7 +16,7 @@ jobs: - name: Install Dependencies run: | brew install doxygen - python3 -m pip install sphinx + python3 -m pip install sphinx -v "sphinx==6.2.1" python3 -m pip install breathe python3 -m pip install sphinx-rtd-theme sphinx-build --version From e8d809ffc3968906148ef0aaffc4892eb3aaacf8 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 14:05:50 -0600 Subject: [PATCH 141/231] Pass format through --- sparse/src/KokkosSparse_gauss_seidel.hpp | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index b7ce643cf9..02faca0729 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -306,8 +306,9 @@ void gauss_seidel_numeric(KernelHandle *handle, scalar_nnz_view_t_ values, bool is_graph_symmetric = true) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, is_graph_symmetric); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + is_graph_symmetric); } /// @@ -457,9 +458,9 @@ void gauss_seidel_numeric(KernelHandle *handle, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric = true) { auto my_exec_space = 
handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, given_inverse_diagonal, - is_graph_symmetric); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + given_inverse_diagonal, is_graph_symmetric); } /// @@ -714,10 +715,10 @@ void symmetric_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - symmetric_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, init_zero_x_vector, - update_y_vector, omega, numIter); + symmetric_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -996,10 +997,10 @@ void forward_sweep_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - forward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, init_zero_x_vector, - update_y_vector, omega, numIter); + forward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -1278,10 +1279,10 @@ void backward_sweep_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - backward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, 
init_zero_x_vector, - update_y_vector, omega, numIter); + backward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// From 956e4c80ffaa23785ece6ef579f46090a2f69262 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 14:54:48 -0600 Subject: [PATCH 142/231] Use ExecutionSpace for user-facing APIs --- docs/developer/apidocs/sparse.rst | 12 +-- sparse/src/KokkosSparse_gauss_seidel.hpp | 111 +++++++++++------------ 2 files changed, 59 insertions(+), 64 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 1f7e702fb4..d35a4eb851 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -63,17 +63,17 @@ gauss_seidel .. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) -.. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. 
doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. 
doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) -.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. 
doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. 
doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 02faca0729..f67d3bd17b 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,13 +29,12 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -45,9 +44,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) has been called previously /// -template -void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, +template +void gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -98,11 +97,10 @@ void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - ExecSpaceIn, const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, + ExecutionSpace, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, is_graph_symmetric); } @@ -175,15 +173,14 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -193,12 +190,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, +void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -262,12 +259,12 @@ void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, is_graph_symmetric); } @@ -316,7 +313,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// numeric values). This version accepts the matrix's inverse diagonal from the /// user. /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -324,8 +321,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided /// inverse diagonal must share this type. 
-/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -339,12 +335,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// the version of gauss_seidel_numeric that /// doesn't take it as an argument. The inverse diagonal will be /// computed internally. -template -void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, +void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -411,13 +407,12 @@ void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, - const_a_d, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, const_a_d, is_graph_symmetric); } @@ -509,7 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -520,7 +515,7 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. 
/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix @@ -538,14 +533,14 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -662,14 +657,14 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, true, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, true); } /// @@ -801,7 +796,7 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix @@ -819,14 +814,14 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -945,14 +940,14 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, true, false); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, false); } /// @@ -1072,7 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1083,7 +1078,7 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix @@ -1101,14 +1096,14 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -1227,14 +1222,14 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, false, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + false, true); } /// From c0f199185e7c172c3841a9bbea77c39caff0bfed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 15:08:32 -0600 Subject: [PATCH 143/231] error check to avoid undefined behavior --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 98624a4137..649229918d 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -156,6 +156,12 @@ class GaussSeidelHandle { "The type of exec_space_in should be the same as " "GaussSeidelHandle::HandleExecSpace"); this->execution_space = exec_space_in; + } else { + if (exec_space_in != this->execution_space) + throw std::runtime_error( + "Gauss Seidel cannot be called on different execution spaces " + "without 
multiple handles. Please create a new handle via " + "create_gs_handle.\n"); } is_set = true; } From 492b9aa3a3f534787f7b343e9d382762f9413066 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 23 Aug 2023 07:57:45 -0600 Subject: [PATCH 144/231] Fix intel19 CI failure --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 820d9ae447..a3b4d8ca37 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -740,6 +740,18 @@ void test_gauss_seidel_streams_rank1( lno_t numCols = numRows; typename crsMat_t::value_type m_omega = omega; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same_v) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + std::cerr << "TEST SKIPPED: Not enough concurrency to partition " + "execution space. exec_concurrency: " + << exec_concurrency << std::endl; + return; + } + } +#endif // KOKKOS_ENABLE_OPENMP + std::vector instances; if (nstreams == 1) instances = Kokkos::Experimental::partition_space(execution_space(), 1); From 3af7aace1894fe55499e7929749d73468ea3a5df Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 23 Aug 2023 07:58:19 -0600 Subject: [PATCH 145/231] Add runtime checks to PSGS --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 1f386a28a4..45f42083a6 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1349,7 +1349,14 @@ class PointGaussSeidel { if (gsHandle->is_symbolic_called() == false) { this->initialize_symbolic(); } - // else + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != 
KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // else #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -1718,6 +1725,14 @@ class PointGaussSeidel { if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); } + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // make sure x and y have been allocated with the correct dimensions nnz_lno_t block_size = gsHandle->get_block_size(); gsHandle->allocate_x_y_vectors(this->num_rows * block_size, From ced63629b784ff5a6cd547b0ed65c1df596fde36 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 23 Aug 2023 13:59:13 -0600 Subject: [PATCH 146/231] Adding exec space instance to spmv (#1932) * Adding exec space instance support to spmv, including cusparse, rocsparse, onemkl wrappers. Fix some misc. bugs too: - Fallback check for cusparse was wrong - we were only calling the TPL for mode 'N', but TPL should be used for 'T' and 'H' also - In SYCL build, forgetting to #undef a macro caused warning in sparse tests - Can't call cuSPARSE or MKL spmv with mode 'H' and real scalars. In this case, just switch to mode 'T' since it's equivalent * Make spmv exec space an independent template param (no longer needs to match AMatrix::execution_space) * Update spmv perf tests that call impl directly * SPMV2D1D, SPMV2D1D_STRUCT -> KokkosSparse::Impl Move them from KokkosSparse:: to KokkosSparse::Impl:: since they're not supposed to be part of the public interface. Keep an alias in KokkosSparse:: for backwards compatibility, but mark it [[deprecated]]. 
* Fix spmv wrapper for oneMKL * Update doxygen for spmv * Add testing for all CRS spmv interfaces * Add missing template argument * Pass along exec space instance in spmv struct mv * Refactor spmv unification layers Don't list out all the matrix, vector template arguments separately. Instead just use the matrix and vector types. * Test spmv with non-default exec space instances * Fixing spmv build errors with TPLs * spmv test: fix for concurrency = 1 Don't try to partition the exec space in 2 if its concurrency is only 1 --- docs/developer/apidocs/sparse.rst | 6 +- perf_test/sparse/KokkosSparse_spmv_merge.cpp | 6 +- .../KokkosSparse_spmv_struct_tuning.cpp | 14 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 174 +-- .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 7 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 348 +++--- sparse/impl/KokkosSparse_spmv_impl.hpp | 349 +++--- sparse/impl/KokkosSparse_spmv_spec.hpp | 334 +++--- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 280 ++--- sparse/impl/KokkosSparse_spmv_struct_spec.hpp | 343 +++--- sparse/src/KokkosSparse_BsrMatrix.hpp | 15 +- sparse/src/KokkosSparse_Utils_cusparse.hpp | 17 + sparse/src/KokkosSparse_Utils_rocsparse.hpp | 18 + sparse/src/KokkosSparse_spmv.hpp | 998 +++++++++++++----- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 128 ++- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 191 ++-- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 30 +- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 75 +- ...okkosSparse_spmv_struct_tpl_spec_avail.hpp | 6 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 109 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 243 +++-- .../Test_Sparse_replaceSumIntoLonger.hpp | 4 +- sparse/unit_test/Test_Sparse_spmv.hpp | 152 ++- 23 files changed, 2268 insertions(+), 1579 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index f73b507439..5129514198 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ 
-38,11 +38,11 @@ crs2coo spmv ---- -.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) -.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. doxygenfunction:: KokkosSparse::spmv(const ExecutionSpace& space, KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(const ExecutionSpace& space, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) .. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) - trsv ---- .. 
doxygenfunction:: KokkosSparse::trsv diff --git a/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/perf_test/sparse/KokkosSparse_spmv_merge.cpp index 3110223e3c..6ad772116e 100644 --- a/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -303,8 +303,10 @@ int main(int argc, char** argv) { for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { Kokkos::Timer timer; // KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - KokkosSparse::Impl::spmv_beta(controls, "N", alpha, test_matrix, x, + KokkosSparse::Impl::spmv_beta(Kokkos::DefaultExecutionSpace{}, + controls, "N", alpha, test_matrix, x, beta, y); Kokkos::fence(); double time = timer.seconds(); diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 1290b5814b..02fcd1640a 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -179,7 +179,8 @@ void struct_matvec(const int stencil_type, int64_t worksets_ext = (numInteriorPts + rows_per_team_ext - 1) / rows_per_team_ext; - KokkosSparse::Impl::SPMV_Struct_Functor + KokkosSparse::Impl::SPMV_Struct_Functor spmv_struct(structure, stencil_type, alpha, A, x, beta, y, rows_per_team_int, rows_per_team_ext); @@ -188,8 +189,10 @@ void struct_matvec(const int stencil_type, << ", vector_length=" << vector_length << std::endl; } - spmv_struct.compute_interior(worksets_int, team_size_int, vector_length); - spmv_struct.compute_exterior(worksets_ext, team_size_ext, vector_length); + spmv_struct.compute_interior(execution_space{}, worksets_int, team_size_int, + vector_length); + spmv_struct.compute_exterior(execution_space{}, worksets_ext, team_size_ext, + vector_length); } // struct_matvec @@ -210,8 +213,9 @@ void matvec(typename YVector::const_value_type& alpha, const AMatrix& A, A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); int64_t worksets = (y.extent(0) + 
rows_per_team - 1) / rows_per_team; - KokkosSparse::Impl::SPMV_Functor func( - alpha, A, x, beta, y, rows_per_team); + KokkosSparse::Impl::SPMV_Functor + func(alpha, A, x, beta, y, rows_per_team); if (print_lp) { std::cout << "worksets=" << worksets << ", team_size=" << team_size diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index abf44589f7..c2eec04fbc 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -48,7 +48,7 @@ struct BsrMatrixSpMVTensorCoreFunctorParams { /// TEAMS_PER_BLOCK_M and TEAMS_PER_BLOCK_N) if non-zero, statically-known /// launch parameters to reduce the cost of divmod operations on the GPU. If 0, /// provided runtime values will be used instead. -template ; typedef typename AMatrix::device_type Device; - typedef Kokkos::TeamPolicy team_policy; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; typedef typename AMatrix::value_type AScalar; typedef typename YMatrix::value_type YScalar; @@ -181,12 +181,13 @@ struct BsrMatrixSpMVTensorCoreFunctor { } // execute the functor with provided launch parameters - void dispatch() { - typename BsrMatrixSpMVTensorCoreFunctor::team_policy policy(league_size(), - team_size()); + void dispatch(const execution_space &exec) { + typename BsrMatrixSpMVTensorCoreFunctor::team_policy policy( + exec, league_size(), team_size()); policy.set_scratch_size(0, Kokkos::PerTeam(team_scratch_size())); - Kokkos::parallel_for("KokkosSparse::BsrMatrixSpMVTensorCoreFunctor", policy, - *this); + Kokkos::parallel_for( + "KokkosSparse::Experimental::BsrMatrixSpMVTensorCoreFunctor", policy, + *this); } /* @@ -412,7 +413,7 @@ struct BsrMatrixSpMVTensorCoreFunctor { /// This is a struct instead of a function for template...using shorthand /// Discriminates between non-complex/on-GPU (supported) and otherwise /// (unsupported) scalar types, and throws a 
runtime error for unsupported types -template - using Dyn = BsrMatrixSpMVTensorCoreFunctor; + using Dyn = + BsrMatrixSpMVTensorCoreFunctor; // to be used when the various matrix types are supported - static void tag_dispatch(std::true_type, YScalar alpha, AMatrix a, XMatrix x, + static void tag_dispatch(std::true_type, const execution_space &exec, + const YScalar alpha, AMatrix a, XMatrix x, YScalar beta, YMatrix y) { BsrMatrixSpMVTensorCoreFunctorParams params = Dyn<0, 0, 0>::launch_parameters(alpha, a, x, beta, y); @@ -436,36 +439,36 @@ struct BsrMatrixSpMVTensorCoreDispatcher { if (false) { // consistency of formatting for next sections } else if (1 == params.leagueDim_x && 1 == params.teamsPerBlockM && 1 == params.teamsPerBlockN) { - Dyn<1, 1, 1>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 1, 1>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 2 == params.teamsPerBlockM && 2 == params.teamsPerBlockN) { - Dyn<1, 2, 2>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 2, 2>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 4 == params.teamsPerBlockM && 4 == params.teamsPerBlockN) { - Dyn<1, 4, 4>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 4, 4>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 8 == params.teamsPerBlockM && 8 == params.teamsPerBlockN) { - Dyn<1, 8, 8>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 8, 8>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 1 == params.teamsPerBlockM && 1 == params.teamsPerBlockN) { - Dyn<2, 1, 1>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 1, 1>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 2 == params.teamsPerBlockM && 2 == params.teamsPerBlockN) { - Dyn<2, 2, 2>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 2, 2>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 4 == 
params.teamsPerBlockM && 4 == params.teamsPerBlockN) { - Dyn<2, 4, 4>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 4, 4>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 8 == params.teamsPerBlockM && 8 == params.teamsPerBlockN) { - Dyn<2, 8, 8>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 8, 8>(alpha, a, x, beta, y, params).dispatch(exec); } else { - Dyn<0, 0, 0>(alpha, a, x, beta, y, params).dispatch(); + Dyn<0, 0, 0>(alpha, a, x, beta, y, params).dispatch(exec); } } // to be used to avoid instantiating on unsupported types - static void tag_dispatch(std::false_type, YScalar, AMatrix, XMatrix, YScalar, - YMatrix) { + static void tag_dispatch(std::false_type, const execution_space &, YScalar, + AMatrix, XMatrix, YScalar, YMatrix) { KokkosKernels::Impl::throw_runtime_exception( "Tensor core SpMV is only supported for non-complex types in GPU " "execution spaces"); @@ -487,15 +490,15 @@ struct BsrMatrixSpMVTensorCoreDispatcher { KokkosKernels::Impl::kk_is_gpu_exec_space(); }; - static void dispatch(YScalar alpha, AMatrix a, XMatrix x, YScalar beta, - YMatrix y) { + static void dispatch(const execution_space &exec, YScalar alpha, AMatrix a, + XMatrix x, YScalar beta, YMatrix y) { // tag will be false unless all conditions are met using tag = std::integral_constant< bool, none_complex::value && all_gpu::value>; - tag_dispatch(tag{}, alpha, a, x, beta, y); + tag_dispatch(tag{}, exec, alpha, a, x, beta, y); } }; @@ -663,6 +666,7 @@ template ()>::type * = nullptr> void spMatVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -672,9 +676,9 @@ void spMatVec_no_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A * x + beta * y @@ -701,14 +705,14 @@ void spMatVec_no_transpose( "KokkosSparse::bspmv", Kokkos::RangePolicy< typename AMatrix_Internal::device_type::execution_space, - Kokkos::Schedule>(0, A.numRows()), + Kokkos::Schedule>(exec, 0, A.numRows()), func); } else { Kokkos::parallel_for( "KokkosSparse::bspmv", Kokkos::RangePolicy< typename AMatrix_Internal::device_type::execution_space, - Kokkos::Schedule>(0, A.numRows()), + Kokkos::Schedule>(exec, 0, A.numRows()), func); } } @@ -723,6 +727,7 @@ template ()>::type * = nullptr> void spMatVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -785,11 +790,11 @@ void spMatVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bspmv", policy, func); } else { @@ -798,11 +803,11 @@ void spMatVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bspmv", policy, func); } @@ -974,6 +979,7 @@ template ()>::type * = nullptr> void spMatVec_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const 
KokkosSparse::Experimental::BsrMatrix< @@ -983,9 +989,9 @@ void spMatVec_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); if (alpha == Kokkos::ArithTraits::zero()) return; @@ -1033,7 +1039,8 @@ template ()>::type * = nullptr> -void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, +void spMatVec_transpose(const typename AMatrix::execution_space &exec, + const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { @@ -1045,7 +1052,10 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, const auto block_dim = A.blockDim(); - KokkosBlas::scal(y, beta, y); + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1092,11 +1102,11 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1104,7 +1114,7 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, else policy = 
Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1113,11 +1123,11 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, func); } else { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1125,7 +1135,7 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1298,6 +1308,7 @@ template ()>::type * = nullptr> void spMatMultiVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1307,9 +1318,9 @@ void spMatMultiVec_no_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A * x + beta * y // @@ -1357,6 +1368,7 @@ template ()>::type * = nullptr> void spMatMultiVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1415,15 +1427,15 @@ void spMatMultiVec_no_transpose( if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, func); } else { @@ -1432,20 +1444,19 @@ void spMatMultiVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, func); } } /* ******************* */ -template +template struct BSR_GEMM_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; @@ -1622,11 +1633,12 @@ struct BSR_GEMM_Transpose_Functor { /// \brief spMatMultiVec_transpose: version for CPU execution spaces /// (RangePolicy or trivial serial impl used) -template ()>::type * = 
nullptr> void spMatMultiVec_transpose( + const execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1636,16 +1648,15 @@ void spMatMultiVec_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A^T * x + beta * y // typedef KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - typedef typename AMatrix_Internal::execution_space execution_space; bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1657,19 +1668,20 @@ void spMatMultiVec_transpose( } } - BSR_GEMM_Transpose_Functor func( - alpha, A, x, y, useConjugate); + BSR_GEMM_Transpose_Functor + func(alpha, A, x, y, useConjugate); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::parallel_for( "KokkosSparse::bsr_spm_mv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } else { Kokkos::parallel_for( "KokkosSparse::bsr_spm_mv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } } @@ -1677,11 +1689,12 @@ void spMatMultiVec_transpose( // // spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> + execution_space>()>::type * = nullptr> void spMatMultiVec_transpose( + const execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector 
&y, bool useConjugate) { @@ -1689,9 +1702,10 @@ void spMatMultiVec_transpose( return; } - KokkosBlas::scal(y, beta, y); - - typedef typename AMatrix::execution_space execution_space; + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1732,16 +1746,16 @@ void spMatMultiVec_transpose( vector_length = std::stoi(controls.getParameter("vector length")); } - BSR_GEMM_Transpose_Functor func(alpha, A, x, y, - useConjugate); + BSR_GEMM_Transpose_Functor func( + alpha, A, x, y, useConjugate); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1749,7 +1763,7 @@ void spMatMultiVec_transpose( else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1758,11 +1772,11 @@ void spMatMultiVec_transpose( func); } else { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1770,7 +1784,7 @@ void spMatMultiVec_transpose( else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp 
b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp index 3ac934f5d8..1c0d2fc361 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -114,11 +114,12 @@ class BsrSpmvV42NonTrans { template -void apply_v42(const Alpha &alpha, const AMatrix &a, const XVector &x, +void apply_v42(const typename AMatrix::execution_space &exec, + const Alpha &alpha, const AMatrix &a, const XVector &x, const Beta &beta, const YVector &y) { - using execution_space = typename YVector::execution_space; + using execution_space = typename AMatrix::execution_space; - Kokkos::RangePolicy policy(0, y.size()); + Kokkos::RangePolicy policy(exec, 0, y.size()); if constexpr (YVector::rank == 1) { // lbv - 07/26/2023: // with_unmanaged_t<...> required Kokkos 4.1.0, diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 69ff744e9d..40bbd2035a 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -33,16 +33,14 @@ namespace Experimental { namespace Impl { // default is no eti available -template +template struct spmv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; @@ -51,38 +49,44 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { } // namespace Experimental } // namespace KokkosSparse -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_bsrmatrix_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { 
value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_bsrmatrix_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_bsrmatrix_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_bsrmatrix_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include which ETIs are available @@ -95,40 +99,34 @@ namespace Experimental { namespace Impl { // declaration -template ::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_bsrmatrix_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + 
ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &x, const YScalar &beta, const YVector &y); }; // declaration -template ::type>::value, + std::is_integral_v, bool tpl_spec_avail = spmv_mv_bsrmatrix_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_bsrmatrix_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_mv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &x, const YScalar &beta, const YVector &y); @@ -142,16 +140,13 @@ constexpr inline const char *ALG_V41 = "v4.1"; constexpr inline const char *ALG_V42 = "v4.2"; constexpr inline const char *ALG_TC = "experimental_bsr_tc"; -template -struct SPMV_BSRMATRIX +struct SPMV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { @@ -163,30 +158,29 @@ struct SPMV_BSRMATRIX() || + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || 
controls.getParameter("algorithm") == ALG_V42) { if (modeIsNoTrans) { - ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; } } // fall back to V41 all else fails if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatVec_no_transpose(controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_no_transpose(space, controls, alpha, A, X, beta, Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatVec_transpose(controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_transpose(space, controls, alpha, A, X, beta, Y, modeIsConjugateTrans); } @@ -200,13 +194,9 @@ struct SPMV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; enum class Method { @@ -222,6 +212,7 @@ struct SPMV_MV_BSRMATRIX::is_complex) method = Method::Fallback; if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; - // can't use tensor cores outside GPU - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename AMatrix::execution_space>()) - method = Method::Fallback; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename XVector::execution_space>()) - method = Method::Fallback; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename YVector::execution_space>()) + // can't use tensor cores outside Nvidia GPU + if constexpr (!std::is_same_v) method = Method::Fallback; // can't use tensor cores unless mode is no-transpose if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback; @@ -277,17 +261,17 @@ struct SPMV_MV_BSRMATRIX::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, + X, beta, Y); return; } case Precision::Double: { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, 
beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, + A, X, beta, Y); return; } case Precision::Automatic: // fallthrough @@ -297,16 +281,14 @@ struct SPMV_MV_BSRMATRIX::value && std::is_same::value; if (operandsHalfHalfFloat) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher< + ExecutionSpace, AMatrix, half, XVector, half, YVector, float, + 16, 16, 16>::dispatch(space, alpha, A, X, beta, Y); return; } else { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher< + ExecutionSpace, AMatrix, double, XVector, double, YVector, + double, 8, 8, 4>::dispatch(space, alpha, A, X, beta, Y); return; } } @@ -319,10 +301,10 @@ struct SPMV_MV_BSRMATRIX::dispatch(alpha, A, - X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, + beta, Y); return; } } @@ -336,30 +318,29 @@ struct SPMV_MV_BSRMATRIX() || + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || controls.getParameter("algorithm") == ALG_V42) { if (modeIsNoTrans) { - ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; } } // use V41 as the ultimate fallback if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, - modeIsConjugate); + return Bsr::spMatMultiVec_no_transpose(space, controls, alpha, A, X, beta, + Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, + return Bsr::spMatMultiVec_transpose(space, controls, alpha, A, X, beta, Y, modeIsConjugateTrans); } @@ -373,29 +354,24 @@ struct SPMV_MV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; static void spmv_mv_bsrmatrix( + 
const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV_BSRMATRIX - impl_type; + typedef SPMV_BSRMATRIX impl_type; for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); - impl_type::spmv_bsrmatrix(controls, mode, alpha, A, x_j, beta, y_j); + impl_type::spmv_bsrmatrix(space, controls, mode, alpha, A, x_j, beta, + y_j); } } }; @@ -408,68 +384,80 @@ struct SPMV_MV_BSRMATRIX, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; - -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_BSRMATRIX< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 
false, true>; + +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; // declare / instantiate the 2D MV version // Instantiate with A,x,y are all the requested Scalar type (no instantiation of // mixed-precision operands) -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - extern template struct SPMV_MV_BSRMATRIX< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; - -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_MV_BSRMATRIX< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const 
SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; + +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 6a82977e02..d00808558f 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -28,32 +28,10 @@ namespace KokkosSparse { namespace Impl { -template -struct GetCoeffView { - typedef Kokkos::View view_type; - typedef Kokkos::View - non_const_view_type; - static non_const_view_type get_view(const InputType in, const int size) { - non_const_view_type aview("CoeffView", size); - if (size > 0) Kokkos::deep_copy(aview, in); - return aview; - } -}; - -template -struct GetCoeffView, DeviceType> { - typedef Kokkos::View view_type; - static Kokkos::View get_view( - const Kokkos::View& in, int /*size*/) { - return in; - } -}; - // This TransposeFunctor is functional, but not necessarily performant. 
-template +template struct SPMV_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -110,10 +88,9 @@ struct SPMV_Transpose_Functor { } }; -template +template struct SPMV_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -268,17 +245,17 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_beta_no_transpose( + const execution_space& exec, const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; @@ -375,6 +352,7 @@ static void spmv_beta_no_transpose( (((uintptr_t)(const void*)(y.data()) % 64) == 0) && !conjugate) { // Note BMK: this case is typically not called in practice even for OpenMP, // since it requires row_block_offsets to have been computed in the graph. 
+ // Also, as this is raw OpenMP the execution space instance is not used spmv_raw_openmp_no_transpose(alpha, A, x, beta, y); return; @@ -390,34 +368,34 @@ static void spmv_beta_no_transpose( use_static_schedule = true; } } - SPMV_Functor func(alpha, A, x, - beta, y, 1); + SPMV_Functor + func(alpha, A, x, beta, y, 1); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) Kokkos::parallel_for( "KokkosSparse::spmv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); else Kokkos::parallel_for( "KokkosSparse::spmv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_beta_no_transpose( + const execution_space& exec, const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; @@ -453,8 +431,8 @@ static void spmv_beta_no_transpose( A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); int64_t worksets = (y.extent(0) + rows_per_team - 1) / rows_per_team; - SPMV_Functor func( - alpha, A, x, beta, y, rows_per_team); + SPMV_Functor + func(alpha, A, x, beta, y, rows_per_team); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> @@ -462,11 +440,11 @@ static void spmv_beta_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::spmv", policy, 
func); } else { @@ -475,11 +453,11 @@ static void spmv_beta_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::spmv", policy, func); } @@ -487,17 +465,17 @@ static void spmv_beta_no_transpose( // spmv_beta_transpose: version for CPU execution spaces (RangePolicy or trivial // serial impl used) -template ()>::type* = nullptr> -static void spmv_beta_transpose(typename YVector::const_value_type& alpha, + execution_space>()>::type* = nullptr> +static void spmv_beta_transpose(const execution_space& exec, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; - using execution_space = typename AMatrix::execution_space; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows() <= static_cast(0)) { return; @@ -506,13 +484,13 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } #if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || \ defined(KOKKOS_ENABLE_THREADS) { - if (execution_space().concurrency() == 1) { + if (exec.concurrency() == 1) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; @@ -567,25 +545,27 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, } #endif - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor + OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), + Kokkos::RangePolicy(exec, 0, nrow), OpType(alpha, A, x, y)); } // spmv_beta_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> -static void spmv_beta_transpose(typename YVector::const_value_type& alpha, + execution_space>()>::type* = nullptr> +static void spmv_beta_transpose(const execution_space& exec, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; - using execution_space = typename AMatrix::execution_space; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows() <= static_cast(0)) { return; @@ -594,7 +574,7 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } // Assuming that no row contains duplicate entries, NNZPerRow @@ -616,7 +596,9 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, (vector_length < max_vector_length)) vector_length *= 2; - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor + OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -625,37 +607,39 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::TeamPolicy(nteams, team_size, vector_length), - op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), + op); } -template -static void spmv_beta(const KokkosKernels::Experimental::Controls& controls, +template +static void spmv_beta(const execution_space& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_beta_no_transpose( - controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_beta_no_transpose( - controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } else if (mode[0] == Transpose[0]) { - spmv_beta_transpose(alpha, A, x, - beta, y); + 
spmv_beta_transpose(exec, alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_beta_transpose(alpha, A, x, - beta, y); + spmv_beta_transpose(exec, alpha, A, x, beta, y); } else { KokkosKernels::Impl::throw_runtime_exception( "Invalid Transpose Mode for KokkosSparse::spmv()"); @@ -665,10 +649,9 @@ static void spmv_beta(const KokkosKernels::Experimental::Controls& controls, // Functor for implementing transpose and conjugate transpose sparse // matrix-vector multiply with multivector (2-D View) input and // output. This functor works, but is not necessarily performant. -template +template struct SPMV_MV_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -771,10 +754,9 @@ struct SPMV_MV_Transpose_Functor { } }; -template +template struct SPMV_MV_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1172,11 +1154,12 @@ struct SPMV_MV_LayoutLeft_Functor { // spmv_alpha_beta_mv_no_transpose: version for CPU execution spaces // (RangePolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1187,7 +1170,7 @@ static void spmv_alpha_beta_mv_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1200,46 +1183,45 @@ static void spmv_alpha_beta_mv_no_transpose( #ifndef 
KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_no_transpose: version for GPU execution spaces // (TeamPolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1251,7 +1233,7 @@ static void spmv_alpha_beta_mv_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1267,63 +1249,62 @@ static void spmv_alpha_beta_mv_no_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType 
op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_transpose: version for CPU execution spaces (RangePolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static 
void spmv_alpha_beta_mv_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1336,45 +1317,46 @@ static void spmv_alpha_beta_mv_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; OpType op(alpha, A, x, beta, y); const ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_transpose: version for GPU execution spaces (TeamPolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1388,7 +1370,7 @@ static void spmv_alpha_beta_mv_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { @@ -1406,79 +1388,86 @@ static void spmv_alpha_beta_mv_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; OpType op(alpha, A, x, beta, y); const ordinal_type nrow = A.numRows(); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); OpType op(alpha, A, x, beta, y); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, 
team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv( - const char mode[], const typename YVector::non_const_value_type& alpha, - const AMatrix& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { + const execution_space& exec, const char mode[], + const typename YVector::non_const_value_type& alpha, const AMatrix& A, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_no_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_no_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_transpose(exec, alpha, A, x, beta, + y); } else { KokkosKernels::Impl::throw_runtime_exception( "Invalid Transpose Mode for KokkosSparse::spmv()"); } } -template -void spmv_alpha_mv(const char mode[], +template +void spmv_alpha_mv(const execution_space& exec, const char mode[], const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1487,17 +1476,17 @@ void spmv_alpha_mv(const char mode[], typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { - spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_alpha_beta_mv(mode, alpha, A, - x, beta, y); + 
spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } } diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index 95cd022159..1d509e2aed 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -30,15 +30,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_eti_spec_avail { enum : bool { value = false }; }; @@ -46,38 +44,44 @@ struct spmv_mv_eti_spec_avail { } // namespace Impl } // namespace KokkosSparse -#define KOKKOSSPARSE_SPMV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct spmv_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct spmv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_eti_spec_avail< \ - const 
SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -94,33 +98,19 @@ namespace Impl { /// \brief Implementation of KokkosSparse::spmv (sparse matrix - dense /// vector multiply) for single vectors (1-D Views). /// -/// The first 5 template parameters are the same as those of -/// KokkosSparse::CrsMatrix. In particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The last 4 template parameters (that start -/// with Y) correspond to the output Kokkos::View. -/// /// For the implementation of KokkosSparse::spmv for multivectors (2-D /// Views), see the SPMV_MV struct below. 
-template ::value, - bool eti_spec_avail = spmv_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> +template < + class ExecutionSpace, class AMatrix, class XVector, class YVector, + bool tpl_spec_avail = + spmv_tpl_spec_avail::value, + bool eti_spec_avail = + spmv_eti_spec_avail::value> struct SPMV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; - typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -146,39 +136,22 @@ struct SPMV { /// matrix, and Op(A) is either A itself, its transpose, or its /// conjugate transpose, depending on the 'mode' argument. /// -/// The first 5 template parameters are the template parameters of the -/// input 1-D View of coefficients 'alpha'. The next 5 template -/// parameters are the same as those of KokkosSparse::CrsMatrix. In -/// particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The 4 template parameters after that -/// (that start with lower-case b) are the template parameters of the -/// input 1-D View of coefficients 'beta'. Next, the 5 template -/// parameters that start with Y correspond to the output -/// Kokkos::View. The last template parameter indicates whether the +/// The last template parameter (integerScalarType) indicates whether the /// matrix's entries have integer type. 
Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template ::type>::value, - bool tpl_spec_avail = spmv_mv_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, - bool eti_spec_avail = spmv_mv_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + std::is_integral_v, + bool tpl_spec_avail = spmv_mv_tpl_spec_avail::value, + bool eti_spec_avail = spmv_mv_eti_spec_avail::value> struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& controls, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -187,16 +160,13 @@ struct SPMV_MV { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). 
// Unification layer -template -struct SPMV +struct SPMV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { @@ -204,79 +174,75 @@ struct SPMV(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_beta(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_beta(controls, mode, alpha, A, x, - beta, y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else { - spmv_beta(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). 
// Unification layer -template -struct SPMV_MV +struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& /*controls*/, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else if (alpha == KAT::one()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else if (alpha == -KAT::one()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } } }; -template -struct SPMV_MV +struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& /*controls*/, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV - impl_type; + typedef SPMV impl_type; KokkosKernels::Experimental::Controls defaultControls; for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); 
auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); - impl_type::spmv(defaultControls, mode, alpha, A, x_j, beta, y_j); + impl_type::spmv(space, defaultControls, mode, alpha, A, x_j, beta, y_j); } } }; @@ -292,65 +258,77 @@ struct SPMV_MV, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SPMV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SPMV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SPMV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - extern template struct SPMV_MV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SPMV_MV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SPMV_MV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SPMV_MV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 8f217e05aa..dc3c592632 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -29,10 +29,9 @@ namespace Impl { enum { FD, FE }; // This TransposeFunctor is 
functional, but not necessarily performant. -template +template struct SPMV_Struct_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -91,13 +90,12 @@ struct SPMV_Struct_Transpose_Functor { } }; -template +template struct SPMV_Struct_Functor { typedef typename AMatrix::non_const_size_type size_type; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; - typedef typename AMatrix::execution_space execution_space; typedef typename execution_space::scratch_memory_space scratch_space; typedef typename KokkosSparse::SparseRowViewConst row_view_const; typedef typename Kokkos::TeamPolicy team_policy; @@ -164,8 +162,8 @@ struct SPMV_Struct_Functor { } } - void compute_interior(const int64_t worksets, const int team_size, - const int vector_length) { + void compute_interior(const execution_space& exec, const int64_t worksets, + const int team_size, const int vector_length) { if (numDimensions == 1) { // Treat interior points using structured algorithm numInterior = ni - 2; @@ -173,16 +171,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(3); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -198,16 +196,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(5); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy 
= Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -217,16 +215,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(9); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -243,16 +241,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(7); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -262,16 +260,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(27); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, 
Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -548,15 +546,15 @@ struct SPMV_Struct_Functor { }); } - void compute_exterior(const int64_t worksets, const int team_size, - const int vector_length) { + void compute_exterior(const execution_space& exec, const int64_t worksets, + const int team_size, const int vector_length) { // Treat exterior points using unstructured algorithm if (numDimensions == 1) { numExterior = 2; if (numExterior > 0) { Kokkos::RangePolicy > - policy(0, numExterior); + policy(exec, 0, numExterior); Kokkos::parallel_for( "KokkosSparse::spmv_struct: exterior", policy, *this); @@ -567,15 +565,15 @@ struct SPMV_Struct_Functor { if (numExterior > 0) { Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); } Kokkos::parallel_for( "KokkosSparse::spmv_struct: exterior", policy, @@ -587,15 +585,15 @@ struct SPMV_Struct_Functor { if (numExterior > 0) { Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); } Kokkos::parallel_for( @@ -773,17 +771,16 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, return rows_per_team; } // spmv_struct_launch_parameters -template +template static void spmv_struct_beta_no_transpose( - const int stencil_type, + const execution_space& exec, const int stencil_type, const Kokkos::View& structure, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename 
AMatrix::ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; } @@ -833,18 +830,21 @@ static void spmv_struct_beta_no_transpose( int64_t worksets_exterior = (numExteriorPts + rows_per_team_ext - 1) / rows_per_team_ext; - SPMV_Struct_Functor spmv_struct( - structure, stencil_type, alpha, A, x, beta, y, rows_per_team_int, - rows_per_team_ext); + SPMV_Struct_Functor + spmv_struct(structure, stencil_type, alpha, A, x, beta, y, + rows_per_team_int, rows_per_team_ext); - spmv_struct.compute_interior(worksets_interior, team_size_int, vector_length); - spmv_struct.compute_exterior(worksets_exterior, team_size_ext, vector_length); + spmv_struct.compute_interior(exec, worksets_interior, team_size_int, + vector_length); + spmv_struct.compute_exterior(exec, worksets_exterior, team_size_ext, + vector_length); } // spmv_struct_beta_no_transpose -template +template static void spmv_struct_beta_transpose( - const int /*stencil_type*/, + const execution_space& exec, const int /*stencil_type*/, const Kokkos::View& /*structure*/, typename YVector::const_value_type& alpha, const AMatrix& A, @@ -859,7 +859,7 @@ static void spmv_struct_beta_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } typedef typename AMatrix::size_type size_type; @@ -875,49 +875,52 @@ static void spmv_struct_beta_transpose( (vector_length < 32)) vector_length *= 2; - typedef SPMV_Struct_Transpose_Functor + typedef SPMV_Struct_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); } -template +template static void spmv_struct_beta( - const char mode[], const int stencil_type, + const execution_space& exec, const char mode[], const int stencil_type, const Kokkos::View& structure, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_struct_beta_no_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_no_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_struct_beta_no_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_no_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == Transpose[0]) { - spmv_struct_beta_transpose( - stencil_type, structure, 
alpha, A, x, beta, y); + spmv_struct_beta_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_struct_beta_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else { KokkosKernels::Impl::throw_runtime_exception( "Invalid Transpose Mode for KokkosSparse::spmv_struct()"); @@ -927,10 +930,9 @@ static void spmv_struct_beta( // Functor for implementing transpose and conjugate transpose sparse // matrix-vector multiply with multivector (2-D View) input and // output. This functor works, but is not necessarily performant. -template +template struct SPMV_MV_Struct_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1007,10 +1009,9 @@ struct SPMV_MV_Struct_Transpose_Functor { } }; -template +template struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1245,9 +1246,10 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { } }; -template +template static void spmv_alpha_beta_mv_struct_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1258,7 +1260,7 @@ static void spmv_alpha_beta_mv_struct_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1278,11 +1280,10 @@ static void spmv_alpha_beta_mv_struct_no_transpose( #ifndef 
KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor + typedef SPMV_MV_Struct_LayoutLeft_Functor< + execution_space, AMatrix, XVector, YVector, doalpha, dobeta, conjugate> OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -1292,30 +1293,28 @@ static void spmv_alpha_beta_mv_struct_no_transpose( // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here @@ -1323,25 +1322,25 @@ static void spmv_alpha_beta_mv_struct_no_transpose( // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. 
- const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv_struct_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1354,7 +1353,7 @@ static void spmv_alpha_beta_mv_struct_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { @@ -1374,11 +1373,10 @@ static void spmv_alpha_beta_mv_struct_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor + typedef SPMV_MV_Struct_Transpose_Functor< + execution_space, AMatrix, XVector, YVector, doalpha, dobeta, conjugate> OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -1387,78 +1385,82 @@ static void spmv_alpha_beta_mv_struct_transpose( // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. 
- const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor + typedef SPMV_MV_Struct_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. 
- const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv_struct( - const char mode[], const typename YVector::non_const_value_type& alpha, - const AMatrix& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { + const execution_space& exec, const char mode[], + const typename YVector::non_const_value_type& alpha, const AMatrix& A, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_no_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_no_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_transpose( + exec, alpha, A, x, beta, y); } else { KokkosKernels::Impl::throw_runtime_exception( "Invalid Transpose Mode for KokkosSparse::spmv()"); } } -template -void spmv_alpha_mv_struct(const char mode[], 
+template +void spmv_alpha_mv_struct(const execution_space& exec, const char mode[], const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1467,17 +1469,17 @@ void spmv_alpha_mv_struct(const char mode[], typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } } diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index 9b22278db2..103bea8781 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -29,16 +29,14 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_struct_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_struct_eti_spec_avail { enum : bool { value = false }; }; @@ -46,38 +44,44 @@ struct spmv_mv_struct_eti_spec_avail { } // namespace Impl } // namespace KokkosSparse -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_struct_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_struct_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_struct_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_struct_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -92,34 +96,18 @@ namespace Impl { /// \brief Implementation of KokkosSparse::spmv_struct (sparse structured matrix /// - dense vector multiply) for single vectors (1-D Views). /// -/// The first 5 template parameters are the same as those of -/// KokkosSparse::CrsMatrix. 
In particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The last 4 template parameters (that start -/// with Y) correspond to the output Kokkos::View. -/// /// For the implementation of KokkosSparse::spmv_struct for multivectors (2-D /// Views), see the SPMV_STRUCT struct below. -template ::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_struct_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; - typedef typename YVector::non_const_value_type coefficient_type; static void spmv_struct( - const char mode[], const int stencil_type, + const ExecutionSpace& space, const char mode[], const int stencil_type, const Kokkos::View& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, @@ -146,57 +134,36 @@ struct SPMV_STRUCT { /// matrix, and Op(A) is either A itself, its transpose, or its /// conjugate transpose, depending on the 'mode' argument. /// -/// The first 5 template parameters are the template parameters of the -/// input 1-D View of coefficients 'alpha'. The next 5 template -/// parameters are the same as those of KokkosSparse::CrsMatrix. In -/// particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. 
The 4 template parameters after that -/// (that start with lower-case b) are the template parameters of the -/// input 1-D View of coefficients 'beta'. Next, the 4 template -/// parameters that start with Y correspond to the output -/// Kokkos::View. The last template parameter indicates whether the +/// The last template parameter integerScalarType indicates whether the /// matrix's entries have integer type. Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template ::type>::value, + std::is_integral_v, bool tpl_spec_avail = spmv_mv_struct_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_struct_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y); + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). 
// Unification layer -template -struct SPMV_STRUCT +struct SPMV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; static void spmv_struct( - const char mode[], const int stencil_type, + const ExecutionSpace& space, const char mode[], const int stencil_type, const Kokkos::View& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, @@ -207,80 +174,72 @@ struct SPMV_STRUCT( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). 
// Unification layer -template -struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_STRUCT { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y) { + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y) { typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else if (alpha == KAT::one()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else if (alpha == -KAT::one()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } } }; -template -struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_STRUCT { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y) { - static_assert(std::is_integral::value, + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y) { + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV_STRUCT - impl_type; + 
typedef SPMV_STRUCT impl_type; for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); - impl_type::spmv_struct(mode, alpha, A, x_j, beta, y_j); + impl_type::spmv_struct(space, mode, alpha, A, x_j, beta, y_j); } } }; @@ -296,65 +255,77 @@ struct SPMV_MV_STRUCT, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL( \ - 
SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - extern template struct SPMV_MV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_MV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp 
b/sparse/src/KokkosSparse_BsrMatrix.hpp index 8b789f66f3..e0d6e61a3b 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -463,7 +463,8 @@ class BsrMatrix { blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -499,7 +500,8 @@ class BsrMatrix { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -661,7 +663,8 @@ class BsrMatrix { blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -703,7 +706,8 @@ class BsrMatrix { : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -724,7 +728,8 @@ class BsrMatrix { blockDim_ = blockDimIn; if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } diff --git a/sparse/src/KokkosSparse_Utils_cusparse.hpp b/sparse/src/KokkosSparse_Utils_cusparse.hpp index 65f6ac9689..55e7144dba 100644 --- 
a/sparse/src/KokkosSparse_Utils_cusparse.hpp +++ b/sparse/src/KokkosSparse_Utils_cusparse.hpp @@ -168,6 +168,23 @@ inline cusparseIndexType_t cusparse_index_type_t_from() { } #endif +// Set the stream on the given cuSPARSE handle when this object +// is constructed, and reset to the default stream when this object is +// destructed. +struct TemporarySetCusparseStream { + TemporarySetCusparseStream(cusparseHandle_t handle_, + const Kokkos::Cuda& exec_) + : handle(handle_) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, exec_.cuda_stream())); + } + + ~TemporarySetCusparseStream() { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, NULL)); + } + + cusparseHandle_t handle; +}; + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index e263dfd0fa..6f79844782 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -178,6 +178,24 @@ struct kokkos_to_rocsparse_type> { #define KOKKOSSPARSE_IMPL_ROCM_VERSION \ ROCM_VERSION_MAJOR * 10000 + ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH +// Set the stream on the given rocSPARSE handle when this object +// is constructed, and reset to the default stream when this object is +// destructed. 
+struct TemporarySetRocsparseStream { + TemporarySetRocsparseStream(rocsparse_handle handle_, + const Kokkos::HIP& exec_) + : handle(handle_) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_set_stream(handle, exec_.hip_stream())); + } + + ~TemporarySetRocsparseStream() { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, NULL)); + } + + rocsparse_handle handle; +}; + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index deb1c75bf0..2aab1cef60 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -40,39 +40,67 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace -/// \brief Tag-dispatch for \c Kokkos sparse matrix-vector multiply on single -/// vector +/// \brief Kokkos sparse matrix-vector multiply on single +/// vectors (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). /// +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-1 Kokkos::View and its rank must match that of XVector /// -/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::BsrMatrix -/// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. -/// \param x [in] A vector. -/// \param beta [in] Scalar multiplier for the multivector y. 
-/// \param y [in/out] vector. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. /// \param tag RANK_ONE dispatch -/// -#ifdef DOXY // documentation version -template +#ifdef DOXY // documentation version - don't separately document SFINAE + // specializations for BSR and CRS +template #else -template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_ONE& tag) { - // Make sure that x and y have the same rank. + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + + // Make sure that x and y have the same rank. 
+ static_assert(XVector::rank == YVector::rank, + "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that x (and therefore y) is rank 1. static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " @@ -136,9 +164,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } @@ -148,18 +176,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], (controls.getParameter("algorithm") != "tpl"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only - // supports the normal (N) mode. 
- if (std::is_same::value || - std::is_same::value) { -#if (9000 <= CUDA_VERSION) - useFallback = useFallback || (mode[0] != NoTranspose[0]); -#endif -#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + // cuSPARSE does not support the conjugate mode (C) + if constexpr (std::is_same_v || + std::is_same_v) { useFallback = useFallback || (mode[0] == Conjugate[0]); -#endif } // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for all // scalar types @@ -197,62 +219,85 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix_Internal::non_const_value_type>::name() + "]"; Kokkos::Profiling::pushRegion(label); - Impl::SPMV::spmv(controls, - mode, - alpha, - A_i, x_i, - beta, - y_i); + Impl::SPMV::spmv(space, controls, mode, alpha, A_i, + x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { // note: the cuSPARSE spmv wrapper defines a profiling region, so one is not // needed here. - Impl::SPMV::spmv(controls, mode, - alpha, A_i, x_i, - beta, y_i); + Impl::SPMV::spmv(space, controls, mode, alpha, A_i, x_i, + beta, y_i); } } -#ifdef DOXY // hide SFINAE +/// \brief Kokkos sparse matrix-vector multiply on single +/// vector (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-1 Kokkos::View and its rank must match that of XVector +/// +/// \param controls [in] kokkos-kernels control structure. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. 
\param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +/// \param tag RANK_ONE dispatch +#ifdef DOXY // documentation version template #else template ::value>::type* = nullptr> + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> #endif void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE) { - // Make sure that x and y have the same rank. + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} + +#ifndef DOXY // hide SFINAE specialization for BSR +template ::value>::type* = nullptr> +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_ONE& tag) { + // Make sure that x and y are Views. 
+ static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + // Make sure that x and y have the same rank. + static_assert(XVector::rank == YVector::rank, + "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that x (and therefore y) is rank 1. static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " @@ -269,7 +314,8 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::size_type> Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_ONE()); + KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, + RANK_ONE()); return; } // Check compatibility of dimensions at run time. @@ -333,9 +379,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } @@ -378,66 +424,58 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix_Internal::non_const_value_type>::name() + "]"; Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename XVector_Internal::const_value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, - false>::spmv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + Experimental::Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, controls, + mode, alpha, A_i, + x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { -#define __SPMV_TYPES__ \ - typename AMatrix_Internal::const_value_type, \ - typename AMatrix_Internal::const_ordinal_type, \ - typename AMatrix_Internal::device_type, \ - typename AMatrix_Internal::memory_traits, \ - typename AMatrix_Internal::const_size_type, \ - typename XVector_Internal::const_value_type*, \ - typename XVector_Internal::array_layout, \ - typename XVector_Internal::device_type, \ - typename XVector_Internal::memory_traits, \ - typename YVector_Internal::value_type*, \ - typename YVector_Internal::array_layout, \ - typename YVector_Internal::device_type, \ - typename YVector_Internal::memory_traits - constexpr bool tpl_spec_avail = 
KokkosSparse::Experimental::Impl::spmv_bsrmatrix_tpl_spec_avail< - __SPMV_TYPES__>::value; + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::value; constexpr bool eti_spec_avail = tpl_spec_avail ? KOKKOSKERNELS_IMPL_COMPILE_LIBRARY /* force FALSE in app/test */ : KokkosSparse::Experimental::Impl::spmv_bsrmatrix_eti_spec_avail< - __SPMV_TYPES__>::value; + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::value; - Experimental::Impl::SPMV_BSRMATRIX<__SPMV_TYPES__, tpl_spec_avail, - eti_spec_avail>::spmv_bsrmatrix(controls, - mode, - alpha, - A_i, x_i, - beta, - y_i); - -#undef __SPMV_TYPES__ + Experimental::Impl::SPMV_BSRMATRIX< + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, + tpl_spec_avail, eti_spec_avail>::spmv_bsrmatrix(space, controls, mode, + alpha, A_i, x_i, beta, + y_i); } } +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} +#endif // ifndef DOXY + +namespace Impl { template struct SPMV2D1D { static bool spmv2d1d(const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y); + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y); }; #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) @@ -448,10 +486,22 @@ struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); return true; } +}; + #else + template struct SPMV2D1D 
+ static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } }; +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); return true; } +}; + #else + template struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } }; +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + #else + template struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } }; +#endif +} // namespace Impl -/// \brief Tag-dispatch sparse matrix-vector multiply on multivectors +template +using SPMV2D1D + [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " + "use KokkosSparse::spmv instead")]] = + Impl::SPMV2D1D; + +/// \brief Kokkos sparse matrix-vector multiply on multivectors +/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). 
/// -/// \tparam AMatrix A KokkosSparse::CrsMatrix, -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector /// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] \c "N" for no transpose -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. -/// \param x [in] A multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the multivector y. -/// \param y [in/out] multivector (exrank-2 Kokkos::View). +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. 
/// \param tag RANK_TWO dispatch -/// -#ifdef DOXY -template +#ifdef DOXY // documentation version +template #else -template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_TWO& tag) { - - // Make sure that x and y have the same rank. + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + // Make sure that x and y have the same rank. + static_assert(XVector::rank == YVector::rank, + "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that x (and therefore y) is rank 2. 
static_assert(static_cast(XVector::rank) == 2, "KokkosSparse::spmv: Both Vector inputs must have rank 2 " @@ -604,10 +736,11 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); // spmv (mode, alpha, A, x_i, beta, y_i); - using impl_type = SPMV2D1D; - if (impl_type::spmv2d1d(mode, alpha, A, x_i, beta, y_i)) { + using impl_type = + Impl::SPMV2D1D; + if (impl_type::spmv2d1d(space, mode, alpha, A, x_i, beta, y_i)) { return; } } @@ -638,54 +771,81 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], if (useNative) { return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value, - false>::spmv_mv(controls, mode, alpha, A_i, x_i, beta, y_i); + false>::spmv_mv(space, controls, mode, alpha, A_i, x_i, beta, y_i); } else { - return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename 
YVector_Internal::memory_traits>::spmv_mv(controls, mode, - alpha, A_i, x_i, - beta, y_i); + return Impl::SPMV_MV::spmv_mv(space, controls, mode, + alpha, A_i, x_i, beta, + y_i); } } } -#ifdef DOXY // hide SFINAE +/// \brief Kokkos sparse matrix-vector multiply on multivectors +/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector +/// +/// \param controls [in] kokkos-kernels control structure. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. 
+/// \param tag RANK_TWO dispatch +#ifdef DOXY template #else template ::value>::type* = nullptr> + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> #endif void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO) { + const BetaType& beta, const YVector& y, const RANK_TWO& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} + +#ifndef DOXY // hide SFINAE +template ::value>::type* = nullptr> +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_TWO& tag) { + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. 
static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -706,7 +866,8 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::size_type> Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_TWO()); + KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, + RANK_TWO()); return; } // Check compatibility of dimensions at run time. @@ -769,9 +930,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } // @@ -793,7 +954,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); - return spmv(controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); + return spmv(space, controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); } // // Whether to call KokkosKernel's native implementation, even if a TPL impl is @@ -827,45 +988,34 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], "]"; Kokkos::Profiling::pushRegion(label); Experimental::Impl::SPMV_MV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename XVector_Internal::const_value_type**, - typename XVector_Internal::array_layout, - typename 
XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value, - false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + false>::spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, + y_i); Kokkos::Profiling::popRegion(); } else { Experimental::Impl::SPMV_MV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename XVector_Internal::const_value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value>:: - spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, y_i); } } +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} +#endif + /// \brief Public interface to local sparse matrix-vector multiply. 
/// -/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// Compute y := beta*y + alpha*Op(A)*x, where x and y are either both /// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View /// instances, and Op(A) is determined /// by \c mode. If beta == 0, ignore and overwrite the initial @@ -888,26 +1038,51 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a /// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. /// -/// \tparam AMatrix KokkosSparse::CrsMatrix or -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank 1 or 2 Kokkos::View and its rank must match that of XVector /// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure -/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" -/// for conjugate transpose. -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. /// \param x [in] Either a single vector (rank-1 Kokkos::View) or /// multivector (rank-2 Kokkos::View). /// \param beta [in] Scalar multiplier for the (multi)vector y. 
/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or /// multivector (rank-2 Kokkos::View). It must have the same number /// of columns as x. -/// -template -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +template +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y) { + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that both x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -955,22 +1130,71 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(space, y, beta, y); return; } // using RANK_SPECIALISE = typename std::conditional(XVector::rank) == 2, RANK_TWO, RANK_ONE>::type; - spmv(controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); + spmv(space, controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); } +/// \brief Public interface to local sparse matrix-vector multiply. +/// +/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View +/// instances, and Op(A) is determined +/// by \c mode. If beta == 0, ignore and overwrite the initial +/// entries of y; if alpha == 0, ignore the entries of A and x. +/// +/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have +/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on +/// Volta or Ampere architectures. On Volta-architecture GPUs the only available +/// precision is mixed-precision fp32 accumulator from fp16 inputs. On +/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, +/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller +/// may override this by setting the \c "tc_precision" = \c "mixed" or +/// \c "double" as desired. +/// +/// For mixed precision, performance will degrade for blockDim < 16. +/// For double precision, for blockDim < 8. +/// For such cases, consider an alternate SpMV algorithm. +/// +/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are +/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a +/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. 
+/// +/// \tparam AMatrix KokkosSparse::CrsMatrix or +/// KokkosSparse::Experimental::BsrMatrix +/// +/// \param controls [in] kokkos-kernels control structure +/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" +/// for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. +/// \param x [in] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). +/// \param beta [in] Scalar multiplier for the (multi)vector y. +/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). It must have the same number +/// of columns as x. +template +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y); +} + +#ifndef DOXY /// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv /// argument types /// -/// This is a catch-all interfaceace that throws a compile-time error if \c +/// This is a catch-all interface that throws a compile-time error if \c /// AMatrix is not a CrsMatrix, or BsrMatrix /// template ::value && + !KokkosSparse::is_crs_matrix::value>::type* = nullptr> +void spmv(const ExecutionSpace& /* space */, + KokkosKernels::Experimental::Controls /*controls*/, + const char[] /*mode*/, const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + // have to arrange this so that the compiler can't tell this is false until + // instantiation + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); +} +#endif // ifndef DOXY + +/// \brief Kokkos sparse matrix-vector multiply. 
+/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector +/// +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. template void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, @@ -999,18 +1262,66 @@ void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, spmv(controls, mode, alpha, A, x, beta, y); } +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). +/// +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector +/// +/// \param space [in] The execution space instance on which to run the +/// kernel. 
+/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +template +void spmv(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + KokkosKernels::Experimental::Controls controls; + spmv(space, controls, mode, alpha, A, x, beta, y); +} + namespace Experimental { -template -void spmv_struct(const char mode[], const int stencil_type, +template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, const Kokkos::View& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE) { + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_ONE& tag) { // Make sure that both x and y have the same rank. static_assert((int)XVector::rank == (int)YVector::rank, "KokkosSparse::spmv_struct: Vector ranks do not match."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: XVector must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: YVector must be accessible from " + "ExecutionSpace"); // Make sure that x (and therefore y) is rank 1. 
static_assert( (int)XVector::rank == 1, @@ -1072,24 +1383,23 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_Internal y_i = y; return KokkosSparse::Impl::SPMV_STRUCT< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_struct(mode, stencil_type, - structure, alpha, - A_i, x_i, beta, - y_i); + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::spmv_struct(space, mode, stencil_type, structure, + alpha, A_i, x_i, beta, y_i); +} + +template +void spmv_struct(const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv_struct(typename AMatrix::execution_space{}, mode, stencil_type, + structure, alpha, A, x, beta, y, tag); } +namespace Impl { template struct SPMV2D1D_STRUCT { @@ -1099,6 +1409,14 @@ struct SPMV2D1D_STRUCT { Kokkos::HostSpace>& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y); + + template + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y); }; #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) @@ -1116,6 +1434,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& 
space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace& /* space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif @@ -1148,6 +1489,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace /*space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif @@ -1180,6 +1544,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace& /*space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& 
/*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif +} // namespace Impl template -void spmv_struct(const char mode[], const int stencil_type, + class YVector, class XLayout = typename XVector::array_layout> +using SPMV2D1D_STRUCT + [[deprecated("KokkosSparse::SPMV2D1D_STRUCT is not part of the public " + "interface - use KokkosSparse::spmv_struct instead")]] = + Impl::SPMV2D1D_STRUCT; + +template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, const Kokkos::View& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO) { + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_TWO& tag) { + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: XVector must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: YVector must be accessible from " + "ExecutionSpace"); // Make sure that both x and y have the same rank. 
static_assert(XVector::rank == YVector::rank, "KokkosBlas::spmv: Vector ranks do not match."); @@ -1261,11 +1674,11 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); // spmv_struct (mode, alpha, A, x_i, beta, y_i); - if (SPMV2D1D_STRUCT:: - spmv2d1d_struct(mode, stencil_type, structure, alpha, A, x_i, beta, - y_i)) { + if (Impl::SPMV2D1D_STRUCT:: + spmv2d1d_struct(space, mode, stencil_type, structure, alpha, A, x_i, + beta, y_i)) { return; } } @@ -1288,24 +1701,24 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_Internal y_i = y; return KokkosSparse::Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>:: - spmv_mv(KokkosKernels::Experimental::Controls(), mode, alpha, A_i, x_i, - beta, y_i); + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::spmv_mv(space, + KokkosKernels::Experimental::Controls(), + mode, alpha, A_i, x_i, beta, y_i); } } +template +void spmv_struct(const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO& tag) { + spmv_struct(typename AMatrix::execution_space{}, mode, stencil_type, + structure, alpha, A, x, beta, y, tag); +} + /// \brief Public interface to structured local sparse matrix-vector multiply. 
/// /// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both @@ -1342,6 +1755,45 @@ void spmv_struct(const char mode[], const int stencil_type, RANK_SPECIALISE()); } +/// \brief Public interface to structured local sparse matrix-vector multiply. +/// +/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View +/// instances, A is a KokkosSparse::CrsMatrix, and Op(A) is determined +/// by \c mode. If beta == 0, ignore and overwrite the initial +/// entries of y; if alpha == 0, ignore the entries of A and x. +/// +/// \param space [in] The execution space instance on which to run the +/// kernel. +/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" +/// for conjugate transpose. +/// \param stencil_type +/// \param structure [in] this 1D view stores the # rows in each dimension +/// (i,j,k) +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix; KokkosSparse::CrsMatrix instance. +/// \param x [in] Either a +/// single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). +/// \param beta [in] Scalar multiplier for the (multi)vector y. +/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). It must have the same number +/// of columns as x. 
+template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + typedef + typename std::conditional::type + RANK_SPECIALISE; + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_SPECIALISE()); +} + } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index b9c1f6c1dd..07bb0a0f0a 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -25,8 +25,7 @@ namespace KokkosSparse { namespace Experimental { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -41,12 +40,15 @@ struct spmv_bsrmatrix_tpl_spec_avail { SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -125,17 +127,22 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct 
spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -159,10 +166,9 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif // Specialization struct which defines whether a specialization exists -template ::type>::value> + std::is_integral_v> struct spmv_mv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -173,17 +179,21 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. For KokkosKernels, this means int/int only. 
// cuSparse level 3 does not currently support LayoutRight -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ - SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, false> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false> { \ + enum : bool { value = true }; \ }; #if (9000 <= CUDA_VERSION) @@ -221,16 +231,23 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const int, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ 
+ Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -257,17 +274,20 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #include "KokkosSparse_Utils_rocsparse.hpp" -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ - SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ - template <> \ - struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + Kokkos::HIP, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 78ce736173..8932beb88a 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -20,6 +20,7 @@ #include "KokkosKernels_AlwaysFalse.hpp" #include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include @@ -198,40 +199,47 @@ inline void spm_mv_block_impl_mkl( #endif -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - BsrMatrix, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix( \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), X.data(), \ - Y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename 
YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix( \ + const EXECSPACE&, \ + const KokkosKernels::Experimental::Controls& /*controls*/, \ + const char mode[], const coefficient_type& alpha, const AMatrix& A, \ + const XVector& X, const coefficient_type& beta, const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), X.data(), \ + Y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -259,18 +267,23 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ - using AMatrix = \ - BsrMatrix, MKL_INT const>; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -279,6 +292,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, using coefficient_type = typename 
YVector::non_const_value_type; \ \ static void spmv_mv_bsrmatrix( \ + const EXECSPACE&, \ const KokkosKernels::Experimental::Controls& /*controls*/, \ const char mode[], const coefficient_type& alpha, const AMatrix& A, \ const XVector& X, const coefficient_type& beta, const YVector& Y) { \ @@ -344,6 +358,7 @@ namespace Impl { template void spmv_block_impl_cusparse( + const Kokkos::Cuda& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -354,6 +369,8 @@ void spmv_block_impl_cusparse( /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -456,6 +473,7 @@ template < typename YVector::array_layout>::value, bool> = true> void spm_mv_block_impl_cusparse( + const Kokkos::Cuda& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -466,6 +484,8 @@ void spm_mv_block_impl_cusparse( /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -557,16 +577,21 @@ void spm_mv_block_impl_cusparse( COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, 
Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ @@ -576,7 +601,8 @@ void spm_mv_block_impl_cusparse( \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::Cuda& exec, \ + const Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -584,7 +610,7 @@ void spm_mv_block_impl_cusparse( std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + spmv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -648,16 +674,23 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, ETI_AVAIL) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, false, true, ETI_AVAIL> { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ 
+ SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, ETI_AVAIL> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -667,7 +700,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_mv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, \ + const Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -675,7 +709,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spm_mv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + spm_mv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -731,6 +765,7 @@ namespace Impl { template void spmv_block_impl_rocsparse( + const Kokkos::HIP& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -901,16 +936,21 @@ void spmv_block_impl_rocsparse( COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET 
const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + Kokkos::HIP, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ @@ -920,7 +960,8 @@ void spmv_block_impl_rocsparse( \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::HIP& exec, \ + const Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -928,7 +969,7 @@ void spmv_block_impl_rocsparse( std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_rocsparse(controls, mode, alpha, A, x, beta, y); \ + spmv_block_impl_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 529abc82b7..5e33df1fa3 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -21,25 +21,27 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which 
defines whether a specialization exists -template ::type>::value> + std::is_integral_v> struct spmv_mv_tpl_spec_avail { enum : bool { value = false }; }; -#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ - XL, YL, MEMSPACE) \ - template <> \ - struct spmv_mv_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ + XL, YL, MEMSPACE) \ + template <> \ + struct spmv_mv_tpl_spec_avail< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR**, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; /* CUSPARSE_VERSION 10300 and lower seem to have a bug in cusparseSpMM diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 717c62b985..dbf94c913d 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -89,7 +89,8 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { } template -void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, +void spmv_mv_cusparse(const Kokkos::Cuda &exec, + const KokkosKernels::Experimental::Controls &controls, const char mode[], typename YVector::non_const_value_type const &alpha, const AMatrix &A, const XVector &x, @@ -108,6 +109,8 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this 
function exits */ + TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t opA; @@ -191,39 +194,43 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); } -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV_MV< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const **, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR **, YL, Kokkos::Device, \ - Kokkos::MemoryTraits, false, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const **, XL, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - using Controls = KokkosKernels::Experimental::Controls; \ - static void spmv_mv(const Controls &controls, const char mode[], \ - const coefficient_type &alpha, const AMatrix &A, \ - const XVector &x, const coefficient_type &beta, \ - const YVector &y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_mv_cusparse(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const **, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + 
using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const **, XL, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + using Controls = KokkosKernels::Experimental::Controls; \ + static void spmv_mv(const Kokkos::Cuda &exec, const Controls &controls, \ + const char mode[], const coefficient_type &alpha, \ + const AMatrix &A, const XVector &x, \ + const coefficient_type &beta, const YVector &y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; /* cusparseSpMM with following restrictions diff --git a/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp index 707e653803..7b853c953c 100644 --- a/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp @@ -20,15 +20,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_struct_tpl_spec_avail { enum : bool { value = false }; }; // Specialization struct which defines whether a specialization exists -template +template struct spmv_mv_struct_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index a8632263f9..1df4a7e5c9 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -24,8 +24,7 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_tpl_spec_avail { enum : 
bool { value = false }; }; @@ -40,12 +39,15 @@ struct spmv_tpl_spec_avail { YL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -181,22 +183,25 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ - template <> \ - struct spmv_tpl_spec_avail< \ - const SCALAR, const rocsparse_int, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const rocsparse_int, \ - const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const rocsparse_int>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(double, Kokkos::LayoutLeft) @@ -215,17 +220,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -243,19 +253,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif #ifdef KOKKOS_ENABLE_SYCL -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - const SCALAR, const ORDINAL, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const ORDINAL, const SCALAR*, \ - Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const ORDINAL>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index fea98c591a..e0c27099ea 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -28,7 +28,8 @@ namespace 
KokkosSparse { namespace Impl { template -void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, +void spmv_cusparse(const Kokkos::Cuda& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, @@ -39,6 +40,8 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -53,6 +56,11 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, throw std::invalid_argument("Invalid mode"); } } + // cuSPARSE doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (myCusparseOperation == CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE && + !Kokkos::ArithTraits::isComplex) + myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) @@ -193,39 +201,43 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, #endif // CUDA_VERSION } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ 
- using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Controls& controls, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_cusparse(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::Cuda& exec, const Controls& controls, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; // BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate @@ -350,7 +362,8 @@ namespace KokkosSparse { namespace Impl { template -void spmv_rocsparse(const 
KokkosKernels::Experimental::Controls& controls, +void spmv_rocsparse(const Kokkos::HIP& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, @@ -362,6 +375,8 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, /* initialize rocsparse library */ rocsparse_handle handle = controls.getRocsparseHandle(); + /* Set rocsparse to use the given stream until this function exits */ + TemporarySetRocsparseStream(handle, exec); /* Set the operation mode */ rocsparse_operation myRocsparseOperation = mode_kk_to_rocsparse(mode); @@ -450,21 +465,21 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ template <> \ - struct SPMV, \ - Kokkos::MemoryTraits, rocsparse_int const, \ - SCALAR const*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, true, \ - COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ + struct SPMV< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + rocsparse_int const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using AMatrix = CrsMatrix; \ @@ -477,14 +492,14 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv(const Controls& controls, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ + static void spmv(const Kokkos::HIP& exec, const Controls& controls, \ + const char mode[], const coefficient_type& 
alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_rocsparse(controls, mode, alpha, A, x, beta, y); \ + spmv_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -600,16 +615,23 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, beta_mkl, reinterpret_cast(y))); } +// Note: classic MKL runs on Serial/OpenMP but can't use our execution space +// instances #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ - struct SPMV< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + struct SPMV, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ CrsMatrix alpha, using coefficient_type = typename YVector::non_const_value_type; \ using Controls = KokkosKernels::Experimental::Controls; \ \ - static void spmv(const Controls&, const char mode[], \ + static void spmv(const EXECSPACE&, const Controls&, const char mode[], \ const coefficient_type& alpha, const AMatrix& A, \ const XVector& x, const coefficient_type& beta, \ const YVector& y) { \ @@ -635,7 +657,6 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::Profiling::popRegion(); \ } \ }; -#endif #ifdef KOKKOS_ENABLE_SERIAL KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -658,6 +679,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #endif #undef KOKKOSSPARSE_SPMV_MKL +#endif #ifdef 
KOKKOS_ENABLE_SYCL inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { @@ -678,8 +700,7 @@ template <> struct spmv_onemkl_wrapper { template - static void spmv(const execution_space& exec, - oneapi::mkl::transpose const mkl_mode, + static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, typename matrix_type::non_const_value_type const alpha, const matrix_type& A, const xview_type& x, typename matrix_type::non_const_value_type const beta, @@ -687,6 +708,12 @@ struct spmv_onemkl_wrapper { using scalar_type = typename matrix_type::non_const_value_type; using ordinal_type = typename matrix_type::non_const_ordinal_type; + // oneAPI doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (mkl_mode == oneapi::mkl::transpose::conjtrans && + !Kokkos::ArithTraits::isComplex) + mkl_mode = oneapi::mkl::transpose::trans; + oneapi::mkl::sparse::matrix_handle_t handle = nullptr; oneapi::mkl::sparse::init_matrix_handle(&handle); auto ev_set = oneapi::mkl::sparse::set_csr_data( @@ -710,8 +737,7 @@ template <> struct spmv_onemkl_wrapper { template - static void spmv(const execution_space& exec, - oneapi::mkl::transpose const mkl_mode, + static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, typename matrix_type::non_const_value_type const alpha, const matrix_type& A, const xview_type& x, typename matrix_type::non_const_value_type const beta, @@ -742,44 +768,47 @@ struct spmv_onemkl_wrapper { } }; -#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV, \ - Kokkos::MemoryTraits, ORDINAL const, \ - SCALAR const*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, true, \ - COMPILE_LIBRARY> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - CrsMatrix, ORDINAL const>; \ - using 
XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - static void spmv(const Controls&, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ - execution_space exec{}; \ - spmv_onemkl_wrapper::is_complex>::spmv( \ - exec, mkl_mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, ORDINAL const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using device_type = Kokkos::Device; \ + using AMatrix = \ + CrsMatrix, ORDINAL const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv(const execution_space& exec, const Controls&, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ + 
Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ + spmv_onemkl_wrapper::is_complex>::spmv( \ + exec, mkl_mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, diff --git a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp index 98affff57d..224b72e2b7 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp @@ -490,9 +490,7 @@ void test_replaceSumIntoLonger() { // FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name" #ifndef KOKKOS_ENABLE_SYCL - #include +#endif // KOKKOS_ENABLE_SYCL #undef KOKKOSKERNELS_EXECUTE_TEST - -#endif // KOKKOS_ENABLE_SYCL diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 16a6b56a48..ca5c5a22f4 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -90,10 +90,23 @@ struct fSPMV { if (error > eps * max_val) { err++; KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, + "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); } } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type &err) const { + const mag_type error = AT::abs(expected_y(i, j) - y(i, j)); + + if (error > eps * max_val) { + err++; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, + AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, + eps * max_val); + } + } }; template @@ -1112,6 +1125,123 @@ void test_github_issue_101() { } } +template +void test_spmv_all_interfaces_light() { + // Using a small matrix, run through the various SpMV interfaces and + // make sure they produce the correct results. 
+ using execution_space = typename DeviceType::execution_space; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + const lno_t m = 111; + const lno_t n = 99; + const mag_t maxVal = 10.0; + const mag_t eps = 10.0 * Kokkos::ArithTraits::eps(); + size_type nnz = 600; + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + m, n, nnz, 2, lno_t(n * 0.7)); + // note: A's values are in range [0, 50) + const mag_t maxError = (nnz / m) * 50.0 * maxVal; + using multivector_t = Kokkos::View; + using vector_t = Kokkos::View; + using range1D_t = Kokkos::RangePolicy; + using range2D_t = Kokkos::MDRangePolicy>; + multivector_t x_mv("x_mv", n, 3); + vector_t x("x", n); + // Randomize x (it won't be modified after that) + Kokkos::fill_random(x_mv, rand_pool, randomUpperBound(maxVal)); + Kokkos::fill_random(x, rand_pool, randomUpperBound(maxVal)); + multivector_t y_mv("y_mv", m, 3); + vector_t y("y", m); + // Compute the correct y = Ax once + multivector_t ygold_mv("ygold_mv", m, 3); + vector_t ygold("ygold", m); + for (lno_t i = 0; i < 3; i++) + Test::sequential_spmv(A, Kokkos::subview(x_mv, Kokkos::ALL(), i), + Kokkos::subview(ygold_mv, Kokkos::ALL(), i), 1.0, + 0.0); + Test::sequential_spmv(A, x, ygold, 1.0, 0.0); + auto clear_y = [&]() { Kokkos::deep_copy(y_mv, scalar_t(0)); }; + auto verify = [&]() { + int num_errors = 0; + Kokkos::parallel_reduce( + "KokkosSparse::Test::spmv", range1D_t(0, m), + Test::fSPMV(ygold, y, eps, maxError), num_errors); + EXPECT_EQ(num_errors, 0); + }; + auto verify_mv = [&]() { + int num_errors = 0; + Kokkos::parallel_reduce("KokkosSparse::Test::spmv", + range2D_t({0, 0}, {m, 3}), + Test::fSPMV( + ygold_mv, y_mv, eps, maxError), + num_errors); + EXPECT_EQ(num_errors, 0); + }; + // Now run through the interfaces and check results each time. 
+ execution_space space; + std::vector space_partitions; + if (space.concurrency() > 1) { + space_partitions = Kokkos::Experimental::partition_space(space, 1, 1); + space = space_partitions[1]; + } + KokkosKernels::Experimental::Controls controls; + // All tagged versions + KokkosSparse::spmv(space, controls, "N", 1.0, A, x, 0.0, y, + KokkosSparse::RANK_ONE()); + space.fence(); + verify(); + clear_y(); + KokkosSparse::spmv(controls, "N", 1.0, A, x, 0.0, y, + KokkosSparse::RANK_ONE()); + verify(); + clear_y(); + KokkosSparse::spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv, + KokkosSparse::RANK_TWO()); + space.fence(); + verify_mv(); + clear_y(); + KokkosSparse::spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv, + KokkosSparse::RANK_TWO()); + verify_mv(); + clear_y(); + // Non-tagged versions + // space and controls + spmv(space, controls, "N", 1.0, A, x, 0.0, y); + space.fence(); + verify(); + clear_y(); + spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv); + space.fence(); + verify_mv(); + clear_y(); + // controls + spmv(controls, "N", 1.0, A, x, 0.0, y); + verify(); + clear_y(); + spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv); + verify_mv(); + clear_y(); + // space + spmv(space, "N", 1.0, A, x, 0.0, y); + space.fence(); + verify(); + clear_y(); + spmv(space, "N", 1.0, A, x_mv, 0.0, y_mv); + space.fence(); + verify_mv(); + clear_y(); + // neither + spmv("N", 1.0, A, x, 0.0, y); + verify(); + clear_y(); + spmv("N", 1.0, A, x_mv, 0.0, y_mv); + verify_mv(); + clear_y(); +} + #define EXECUTE_TEST_ISSUE_101(DEVICE) \ TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ test_github_issue_101(); \ @@ -1136,6 +1266,14 @@ void test_github_issue_101() { 100, 5); \ } +#define EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse_spmv_interfaces_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + test_spmv_all_interfaces_light(); \ + } + #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, 
DEVICE) \ TEST_F( \ TestCategory, \ @@ -1198,9 +1336,10 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ - EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ + EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) #include @@ -1212,8 +1351,9 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) #include From d398cd976ec48fc35a3e92a9c7a159c0065d8a92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Aug 2023 08:04:16 -0600 Subject: [PATCH 147/231] Implement PR feedback Remove a couple fences from the unit tests. 
--- docs/developer/apidocs/sparse.rst | 14 +++++++------- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 2 -- sparse/impl/KokkosSparse_gauss_seidel_spec.hpp | 16 ++++++++-------- sparse/src/KokkosKernels_Handle.hpp | 2 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 12 ++++++------ sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 10 ++-------- 6 files changed, 24 insertions(+), 32 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index d35a4eb851..165d8334ae 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -61,19 +61,19 @@ block_spgemm gauss_seidel ------------ .. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) -.. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) -.. doxygenfunction:: gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) -.. 
doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) -.. 
doxygenfunction:: symmetric_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: symmetric_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. 
doxygenfunction:: forward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. 
doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 45f42083a6..7391e00e3d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1860,7 +1860,6 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); - my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( @@ -1946,7 +1945,6 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); - my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 84c9dccf5c..840ced73b8 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -128,7 +128,7 @@ template ::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); @@ -144,13 +144,13 @@ template < KernelHandle, a_size_view_t_, a_lno_view_t, 
a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, @@ -169,7 +169,7 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, @@ -187,7 +187,7 @@ struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { @@ -224,7 +224,7 @@ struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric) { @@ -255,7 +255,7 @@ struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn 
&exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index a23826f864..d5a24ac1f1 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -630,7 +630,7 @@ class KokkosKernelsHandle { */ // clang-format on void create_gs_handle( - HandleExecSpace handle_exec_space, int num_streams, + const HandleExecSpace &handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index f67d3bd17b..036fe1b119 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -46,7 +46,7 @@ namespace Experimental { /// template -void gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -195,7 +195,7 @@ template -void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -340,7 +340,7 @@ template -void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, 
lno_nnz_view_t_ entries, @@ -540,7 +540,7 @@ template void symmetric_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -821,7 +821,7 @@ template void forward_sweep_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -1103,7 +1103,7 @@ template void backward_sweep_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index a3b4d8ca37..35fbcb44a4 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -808,24 +808,18 @@ void test_gauss_seidel_streams_rank1( int apply_count = 3; // test symmetric, forward, backward //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { - Kokkos::Timer timer1; - for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(instances[i], x_vector_v[i], zero); - for (int i = 0; i < nstreams; i++) instances[i].fence(); run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); - // double gs = timer1.seconds(); - // KokkosKernels::Impl::print_1Dview(x_vector); for (int i = 0; i < nstreams; i++) { - instances[i].fence(); // Wait for apply to finish updating 
x_vector KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); - std::string info = "on stream_idx: " + std::to_string(i); - EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; } } From 41b5a398eaf50b1cc5d784c51527d9bef7c44dac Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 24 Aug 2023 10:24:23 -0600 Subject: [PATCH 148/231] Fix #1951 In spmv, for versions where x/y are rank-2 but it works on a single vector subview at a time, use the correct rank-1 vector types on the "impl_type" template params. --- sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 4 +++- sparse/impl/KokkosSparse_spmv_spec.hpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 40bbd2035a..99f6cc9bec 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -366,10 +366,12 @@ struct SPMV_MV_BSRMATRIX, "This implementation is only for integer Scalar types."); - typedef SPMV_BSRMATRIX impl_type; for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); + typedef SPMV_BSRMATRIX + impl_type; impl_type::spmv_bsrmatrix(space, controls, mode, alpha, A, x_j, beta, y_j); } diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index 1d509e2aed..8bd52088a5 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -237,11 +237,12 @@ struct SPMV_MV, "This implementation is only for integer Scalar types."); - typedef SPMV impl_type; KokkosKernels::Experimental::Controls defaultControls; for (typename AMatrix::non_const_size_type j = 0; j < 
x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); + typedef SPMV + impl_type; impl_type::spmv(space, defaultControls, mode, alpha, A, x_j, beta, y_j); } } From 06e682670ad85d431d6c96fbc80180c459db4cb2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 28 Aug 2023 08:34:34 -0600 Subject: [PATCH 149/231] Ensure CUDA is enabled --- sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 40bbd2035a..ba608c177e 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -216,7 +216,8 @@ struct SPMV_MV_BSRMATRIX Date: Mon, 28 Aug 2023 12:36:34 -0600 Subject: [PATCH 150/231] Disallow BsrMatrix tensor core SpMV for non-scalar types --- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 60 ++++++++++++------- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 11 +--- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index c2eec04fbc..06fe6f094d 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -37,6 +37,40 @@ struct BsrMatrixSpMVTensorCoreFunctorParams { int leagueDim_y; }; +/*! \brief Can the tensor core impl be used in ExecutionSpace to operate on + AMatrix, XMatrix, and YMatrix? 
+*/ +template +class TensorCoresAvailable { +#if defined(KOKKOS_ENABLE_CUDA) + using AScalar = typename AMatrix::non_const_value_type; + using YScalar = typename YMatrix::non_const_value_type; + using XScalar = typename XMatrix::non_const_value_type; + + using a_mem_space = typename AMatrix::memory_space; + using x_mem_space = typename XMatrix::memory_space; + using y_mem_space = typename YMatrix::memory_space; + + template + constexpr static bool is_scalar() { + return std::is_scalar_v || + std::is_same_v, Kokkos::Experimental::half_t>; + } + + public: + constexpr static inline bool value = + Kokkos::SpaceAccessibility::accessible && + Kokkos::SpaceAccessibility::accessible && + Kokkos::SpaceAccessibility::accessible && + is_scalar() && is_scalar() && is_scalar() && + std::is_same_v; +#else + public: + constexpr static inline bool value = false; +#endif +}; + /// \brief Functor for the BsrMatrix SpMV multivector implementation utilizing /// tensor cores. /// @@ -473,31 +507,13 @@ struct BsrMatrixSpMVTensorCoreDispatcher { "Tensor core SpMV is only supported for non-complex types in GPU " "execution spaces"); } - - /*true if none of T1, T2, or T3 are complex*/ - template - struct none_complex { - const static bool value = !Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - }; - - /*true if T1::execution_space, T2, or T3 are all GPU exec space*/ - template - struct all_gpu { - const static bool value = KokkosKernels::Impl::kk_is_gpu_exec_space() && - KokkosKernels::Impl::kk_is_gpu_exec_space() && - KokkosKernels::Impl::kk_is_gpu_exec_space(); - }; - static void dispatch(const execution_space &exec, YScalar alpha, AMatrix a, XMatrix x, YScalar beta, YMatrix y) { // tag will be false unless all conditions are met - using tag = std::integral_constant< - bool, none_complex::value && - all_gpu::value>; + using tag = + std::integral_constant::value>; tag_dispatch(tag{}, exec, alpha, a, x, beta, y); } }; diff 
--git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 99f6cc9bec..f4141996d2 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -219,18 +219,13 @@ struct SPMV_MV_BSRMATRIX::is_complex) method = Method::Fallback; - if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; - if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; - // can't use tensor cores outside Nvidia GPU - if constexpr (!std::is_same_v) + if (!KokkosSparse::Experimental::Impl::TensorCoresAvailable< + ExecutionSpace, AMatrix, XVector, YVector>::value) { method = Method::Fallback; + } // can't use tensor cores unless mode is no-transpose if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback; #if KOKKOS_HALF_T_IS_FLOAT From 25a55f47e384e371c2bec9e4e84def273f1f48f6 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 29 Aug 2023 10:35:07 -0700 Subject: [PATCH 151/231] Address PR review comments --- sparse/src/KokkosSparse_Utils.hpp | 193 +++++++++++++++++------------- 1 file changed, 109 insertions(+), 84 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 0e5dc6d9b0..45b547a5da 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2339,8 +2339,6 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using row_map_hostmirror_type = typename row_map_type::HostMirror; using entries_hostmirror_type = typename entries_type::HostMirror; using values_hostmirror_type = typename values_type::HostMirror; - using int_view1d_type = - Kokkos::View; using graph_t = typename crsMat_t::StaticCrsGraphType; using out_row_map_type = typename graph_t::row_map_type::non_const_type; @@ -2350,6 +2348,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using out_entries_hostmirror_type = typename out_entries_type::HostMirror; using out_values_hostmirror_type = typename 
out_values_type::HostMirror; + using ordinal_type = typename crsMat_t::non_const_ordinal_type; + using size_type = typename crsMat_t::non_const_size_type; + using offset_view1d_type = Kokkos::View; + row_map_type A_row_map = A.graph.row_map; entries_type A_entries = A.graph.entries; values_type A_values = A.values; @@ -2361,95 +2363,118 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( values_hostmirror_type A_values_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values); - int A_nrows = static_cast(A_row_map.extent(0)) - 1; - int n_blocks = static_cast(DiagBlk_v.size()); - - int rows_per_block = ((A_nrows % n_blocks) == 0) ? (A_nrows / n_blocks) - : (A_nrows / n_blocks + 1); - - std::vector row_map_v(n_blocks); - std::vector entries_v(n_blocks); - std::vector values_v(n_blocks); - std::vector row_map_h_v(n_blocks); - std::vector entries_h_v(n_blocks); - std::vector values_h_v(n_blocks); - - int row_start = 0; // first row index of i-th diagonal block - int col_start = 0; // first col index of i-th diagonal block - int nrows, ncols; // Nrows, Ncols of i-th diagonal block - for (int i = 0; i < n_blocks; i++) { - nrows = rows_per_block; - if ((row_start + rows_per_block) > A_nrows) { - nrows = A_nrows - row_start; + ordinal_type A_nrows = static_cast(A_row_map.extent(0)) - 1; + ordinal_type A_ncols = static_cast(A.numCols()); + ordinal_type n_blocks = static_cast(DiagBlk_v.size()); + + if (A_nrows != A_ncols) { + std::ostringstream os; + os << "The diagonal block extraction only works with square matrices -- matrix A: " << A_nrows << " x " << A_ncols; + throw std::runtime_error(os.str()); + } + + if (n_blocks == 1) { + // One block case: simply shallow copy A to DiagBlk_v[0] + DiagBlk_v[0] = crsMat_t(A); + } + else { + // n_blocks > 1 + if (A_nrows == 0) { + // Degenerate case: A is an empty matrix + for (ordinal_type i = 0; i < n_blocks; i++) { + DiagBlk_v[i] = crsMat_t(); + } } - col_start = row_start; - ncols = nrows; - - // Rowmap of i-th 
row-oriented sub-matrix - auto A_row_map_sub = Kokkos::subview( - A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); - - // First round: count i-th non-zeros or size of entries_v[i] - int n_entries = 0; - int_view1d_type first("first", nrows); // first position per row - int_view1d_type last("last", nrows); // last position per row - - for (int j = 0; j < nrows; j++) { // loop through each row - int k1 = static_cast(A_row_map_sub(j)); - int k2 = static_cast(A_row_map_sub(j + 1)); - int k; - // Assume column indices are sorted in ascending order - // Find the position of the start column in the row - for (k = k1; k < k2; k++) { - int col = static_cast(A_entries_h(k)); - if (col >= col_start) { - break; + else { + // A_nrows >= 1 + ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); + + std::vector row_map_v(n_blocks); + std::vector entries_v(n_blocks); + std::vector values_v(n_blocks); + std::vector row_map_h_v(n_blocks); + std::vector entries_h_v(n_blocks); + std::vector values_h_v(n_blocks); + + ordinal_type row_start = 0; // first row index of i-th diagonal block + ordinal_type col_start = 0; // first col index of i-th diagonal block + ordinal_type nrows, ncols; // Nrows, Ncols of i-th diagonal block + + for (ordinal_type i = 0; i < n_blocks; i++) { + nrows = rows_per_block; + if ((row_start + rows_per_block) > A_nrows) { + nrows = A_nrows - row_start; } - } - first(j) = k; - // Find the position of the last column in the row - for (k = k2 - 1; k >= k1; k--) { - int col = static_cast(A_entries_h(k)); - if (col < col_start + ncols) { - break; + col_start = row_start; + ncols = nrows; + + // Rowmap of i-th row-oriented sub-matrix + auto A_row_map_sub = Kokkos::subview( + A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); + + // First round: count i-th non-zeros or size of entries_v[i] + size_type n_entries = 0; + offset_view1d_type first("first", nrows); // first position per 
row + offset_view1d_type last("last", nrows); // last position per row + + for (ordinal_type j = 0; j < nrows; j++) { // loop through each row + size_type k1 = A_row_map_sub(j); + size_type k2 = A_row_map_sub(j + 1); + size_type k; + // Assume column indices are sorted in ascending order + // Find the position of the start column in the row + for (k = k1; k < k2; k++) { + ordinal_type col = A_entries_h(k); + if (col >= col_start) { + break; + } + } + first(j) = k; + // Find the position of the last column in the row + for (k = k2 - 1; k >= k1; k--) { + ordinal_type col = A_entries_h(k); + if (col < col_start + ncols) { + break; + } + } + last(j) = k; + n_entries += (last(j) - first(j) + 1); } - } - last(j) = k; - n_entries += (last(j) - first(j) + 1); - } - // Second round: - // - create row_map_v[i] - // - copy A_entries to entries_v[i] and update entries_v[i] with local - // column indices - // - copy A_values to values_v[i] - row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); - entries_v[i] = out_entries_type("entries_v", n_entries); - values_v[i] = out_values_type("values_v", n_entries); - row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); - entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); - values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); - int first_ = 0; - for (int j = 0; j < nrows; j++) { // loop through each row - int nnz = last(j) - first(j) + 1; - row_map_h_v[i](j) = first_; - for (int k = 0; k < nnz; k++) { - entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; - values_h_v[i](first_ + k) = A_values_h(first(j) + k); - } - first_ += nnz; - } - row_map_h_v[i](nrows) = n_entries; // last element + // Second round: + // - create row_map_v[i] + // - copy A_entries to entries_v[i] and update entries_v[i] with local + // column indices + // - copy A_values to values_v[i] + row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); + entries_v[i] = out_entries_type("entries_v", 
n_entries); + values_v[i] = out_values_type("values_v", n_entries); + row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); + entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); + values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); + size_type first_ = 0; + for (ordinal_type j = 0; j < nrows; j++) { // loop through each row + size_type nnz = last(j) - first(j) + 1; + row_map_h_v[i](j) = first_; + for (size_type k = 0; k < nnz; k++) { + entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; + values_h_v[i](first_ + k) = A_values_h(first(j) + k); + } + first_ += nnz; + } + row_map_h_v[i](nrows) = n_entries; // last element - Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); - Kokkos::deep_copy(entries_v[i], entries_h_v[i]); - Kokkos::deep_copy(values_v[i], values_h_v[i]); + Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); + Kokkos::deep_copy(entries_v[i], entries_h_v[i]); + Kokkos::deep_copy(values_v[i], values_h_v[i]); - DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], - row_map_v[i], entries_v[i]); + DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], + row_map_v[i], entries_v[i]); - row_start += nrows; - } + row_start += nrows; + } // for (ordinal_type i = 0; i < n_blocks; i++) + } // A_nrows >= 1 + } // n_blocks > 1 } } // namespace Impl From 58f27b0e9c8677e459eb35a6adedb6303287f51d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 29 Aug 2023 11:51:23 -0600 Subject: [PATCH 152/231] sparse/src: Add doxygen-style comment --- sparse/src/KokkosSparse_Utils.hpp | 46 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 45b547a5da..2c20dc71f4 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2330,6 +2330,17 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } } +/** + * @brief 
Extract the diagonal blocks out of a crs matrix. + * This is a blocking function that runs on the host. + * + * @tparam crsMat_t The type of the CRS matrix + * @param A The CrsMatrix. + * @param DiagBlk_v [in/out] The location for extracting the diagonal blocks. + * + * Usage Example: + * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); + */ template void kk_extract_diagonal_blocks_crsmatrix_sequential( const crsMat_t &A, std::vector &DiagBlk_v) { @@ -2350,7 +2361,8 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; - using offset_view1d_type = Kokkos::View; + using offset_view1d_type = + Kokkos::View; row_map_type A_row_map = A.graph.row_map; entries_type A_entries = A.graph.entries; @@ -2369,33 +2381,35 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( if (A_nrows != A_ncols) { std::ostringstream os; - os << "The diagonal block extraction only works with square matrices -- matrix A: " << A_nrows << " x " << A_ncols; + os << "The diagonal block extraction only works with square matrices -- " + "matrix A: " + << A_nrows << " x " << A_ncols; throw std::runtime_error(os.str()); } if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] DiagBlk_v[0] = crsMat_t(A); - } - else { + } else { // n_blocks > 1 if (A_nrows == 0) { // Degenerate case: A is an empty matrix for (ordinal_type i = 0; i < n_blocks; i++) { DiagBlk_v[i] = crsMat_t(); } - } - else { + } else { // A_nrows >= 1 - ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - + ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) + ? 
(A_nrows / n_blocks) + : (A_nrows / n_blocks + 1); + std::vector row_map_v(n_blocks); std::vector entries_v(n_blocks); std::vector values_v(n_blocks); std::vector row_map_h_v(n_blocks); std::vector entries_h_v(n_blocks); std::vector values_h_v(n_blocks); - + ordinal_type row_start = 0; // first row index of i-th diagonal block ordinal_type col_start = 0; // first col index of i-th diagonal block ordinal_type nrows, ncols; // Nrows, Ncols of i-th diagonal block @@ -2452,9 +2466,9 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); - size_type first_ = 0; + size_type first_ = 0; for (ordinal_type j = 0; j < nrows; j++) { // loop through each row - size_type nnz = last(j) - first(j) + 1; + size_type nnz = last(j) - first(j) + 1; row_map_h_v[i](j) = first_; for (size_type k = 0; k < nnz; k++) { entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; @@ -2468,13 +2482,13 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( Kokkos::deep_copy(entries_v[i], entries_h_v[i]); Kokkos::deep_copy(values_v[i], values_h_v[i]); - DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], - row_map_v[i], entries_v[i]); + DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, + values_v[i], row_map_v[i], entries_v[i]); row_start += nrows; - } // for (ordinal_type i = 0; i < n_blocks; i++) - } // A_nrows >= 1 - } // n_blocks > 1 + } // for (ordinal_type i = 0; i < n_blocks; i++) + } // A_nrows >= 1 + } // n_blocks > 1 } } // namespace Impl From 2d963e9f6657057bdb2cb8e6ef3fb9dc9980654e Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Tue, 29 Aug 2023 11:55:34 -0600 Subject: [PATCH 153/231] Apply clang format --- sparse/src/KokkosSparse_Utils.hpp | 35 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 
deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 45b547a5da..c2125c5c96 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2350,7 +2350,8 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; - using offset_view1d_type = Kokkos::View; + using offset_view1d_type = + Kokkos::View; row_map_type A_row_map = A.graph.row_map; entries_type A_entries = A.graph.entries; @@ -2369,33 +2370,35 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( if (A_nrows != A_ncols) { std::ostringstream os; - os << "The diagonal block extraction only works with square matrices -- matrix A: " << A_nrows << " x " << A_ncols; + os << "The diagonal block extraction only works with square matrices -- " + "matrix A: " + << A_nrows << " x " << A_ncols; throw std::runtime_error(os.str()); } if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] DiagBlk_v[0] = crsMat_t(A); - } - else { + } else { // n_blocks > 1 if (A_nrows == 0) { // Degenerate case: A is an empty matrix for (ordinal_type i = 0; i < n_blocks; i++) { DiagBlk_v[i] = crsMat_t(); } - } - else { + } else { // A_nrows >= 1 - ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - + ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) + ? 
(A_nrows / n_blocks) + : (A_nrows / n_blocks + 1); + std::vector row_map_v(n_blocks); std::vector entries_v(n_blocks); std::vector values_v(n_blocks); std::vector row_map_h_v(n_blocks); std::vector entries_h_v(n_blocks); std::vector values_h_v(n_blocks); - + ordinal_type row_start = 0; // first row index of i-th diagonal block ordinal_type col_start = 0; // first col index of i-th diagonal block ordinal_type nrows, ncols; // Nrows, Ncols of i-th diagonal block @@ -2452,9 +2455,9 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); - size_type first_ = 0; + size_type first_ = 0; for (ordinal_type j = 0; j < nrows; j++) { // loop through each row - size_type nnz = last(j) - first(j) + 1; + size_type nnz = last(j) - first(j) + 1; row_map_h_v[i](j) = first_; for (size_type k = 0; k < nnz; k++) { entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; @@ -2468,13 +2471,13 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( Kokkos::deep_copy(entries_v[i], entries_h_v[i]); Kokkos::deep_copy(values_v[i], values_h_v[i]); - DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, values_v[i], - row_map_v[i], entries_v[i]); + DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, + values_v[i], row_map_v[i], entries_v[i]); row_start += nrows; - } // for (ordinal_type i = 0; i < n_blocks; i++) - } // A_nrows >= 1 - } // n_blocks > 1 + } // for (ordinal_type i = 0; i < n_blocks; i++) + } // A_nrows >= 1 + } // n_blocks > 1 } } // namespace Impl From 1b9ff060f08da51f7fcda236d3324e349585363a Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 29 Aug 2023 11:06:07 -0700 Subject: [PATCH 154/231] Update document --- sparse/src/KokkosSparse_Utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 2c20dc71f4..c5ef1ad039 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2335,8 +2335,8 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, * This is a blocking function that runs on the host. * * @tparam crsMat_t The type of the CRS matrix - * @param A The CrsMatrix. - * @param DiagBlk_v [in/out] The location for extracting the diagonal blocks. + * @param A [in] The CrsMatrix. + * @param DiagBlk_v [out] The vector of extracted the CRS diagonal blocks. * * Usage Example: * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); From 1c7b6c955ddb738b8a9614d795552c4074aaf5e9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 29 Aug 2023 12:47:44 -0600 Subject: [PATCH 155/231] Update create_gs_handle docs --- sparse/src/KokkosKernels_Handle.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index d5a24ac1f1..d500f19d48 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -652,14 +652,19 @@ class KokkosKernelsHandle { * * @param gs_algorithm Specifies which algorithm to use: * - * KokkosSpace::GS_DEFAULT PointGaussSeidel - * KokkosSpace::GS_PERMUTED ?? - * KokkosSpace::GS_TEAM ?? - * KokkosSpace::GS_CLUSTER ?? - * KokkosSpace::GS_TWOSTAGE ?? + * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. + * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. + * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. + * KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. Within a cluster, x is updated sequentially. + * For more information, see: https://arxiv.org/pdf/2204.02934.pdf. 
+ * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. + * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: * - * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array From d00b62570dd47bca79c4ea06e30ca9402e85964b Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 30 Aug 2023 02:29:08 -0700 Subject: [PATCH 156/231] Add unit test for kk_extract_diagonal_blocks_crsmatrix_sequential --- sparse/src/KokkosSparse_Utils.hpp | 18 +-- sparse/unit_test/Test_Sparse.hpp | 1 + .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 150 ++++++++++++++++++ 3 files changed, 158 insertions(+), 11 deletions(-) create mode 100644 sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index c5ef1ad039..65e7e4243d 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2344,13 +2344,9 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, template void kk_extract_diagonal_blocks_crsmatrix_sequential( const crsMat_t &A, std::vector &DiagBlk_v) { - using row_map_type = typename crsMat_t::row_map_type; - using entries_type = typename crsMat_t::index_type; - using values_type = typename crsMat_t::values_type; - using row_map_hostmirror_type = typename row_map_type::HostMirror; - using entries_hostmirror_type = typename entries_type::HostMirror; - using values_hostmirror_type = typename values_type::HostMirror; - + using row_map_type = typename crsMat_t::row_map_type; + using entries_type = typename crsMat_t::index_type; + using 
values_type = typename crsMat_t::values_type; using graph_t = typename crsMat_t::StaticCrsGraphType; using out_row_map_type = typename graph_t::row_map_type::non_const_type; using out_entries_type = typename graph_t::entries_type::non_const_type; @@ -2368,14 +2364,14 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( entries_type A_entries = A.graph.entries; values_type A_values = A.values; - row_map_hostmirror_type A_row_map_h = + auto A_row_map_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_row_map); - entries_hostmirror_type A_entries_h = + auto A_entries_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_entries); - values_hostmirror_type A_values_h = + auto A_values_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values); - ordinal_type A_nrows = static_cast(A_row_map.extent(0)) - 1; + ordinal_type A_nrows = static_cast(A.numRows()); ordinal_type A_ncols = static_cast(A.numCols()); ordinal_type n_blocks = static_cast(DiagBlk_v.size()); diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 2eb9f6f122..8ae06b598a 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -43,6 +43,7 @@ #include "Test_Sparse_ccs2crs.hpp" #include "Test_Sparse_crs2ccs.hpp" #include "Test_Sparse_removeCrsMatrixZeros.hpp" +#include "Test_Sparse_extractCrsDiagonalBlocks.hpp" // TPL specific tests, these require // particular pairs of backend and TPL diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp new file mode 100644 index 0000000000..f74c095ccb --- /dev/null +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosSparse_Utils.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void run_test_extract_diagonal_blocks(int nrows, int nblocks) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hm = typename RowMapType::HostMirror; + using EntriesType_hm = typename EntriesType::HostMirror; + using ValuesType_hm = typename ValuesType::HostMirror; + using crsMat_t = CrsMatrix; + using AT = Kokkos::ArithTraits; + + crsMat_t A; + std::vector DiagBlks(nblocks); + + if (nrows != 0) { + // Generate test matrix + const size_type nnz = 2 + (nrows - 2) * 3 + 2; + RowMapType_hm hrow_map("hrow_map", nrows + 1); + EntriesType_hm hentries("hentries", nnz); + ValuesType_hm hvalues ("hvalues", nnz); + + // first row + hrow_map(0) = 0; + hentries(0) = 0; + hentries(1) = 1; + hvalues(0) = 0; + hvalues(1) = 1; + // rows in between + int cnt = 2; + for(int i = 1; i <= (nrows-2); i++) { + hrow_map(i) = cnt; + hentries(cnt) = -1 + i; + hentries(cnt+1) = 0 + i; + hentries(cnt+2) = 1 + i; + hvalues(cnt) = -1 + i; + hvalues(cnt+1) = 0 + i; + hvalues(cnt+2) = 1 + i; + cnt += 3; + } + // last row + hrow_map(nrows-1) = cnt; + hentries(nnz-2) = nrows-2; + hentries(nnz-1) = nrows-1; + hvalues(nnz-2) = nrows-2; + hvalues(nnz-1) = nrows-1; + // last element of row_map + hrow_map(nrows) = nnz; + + // Allocate A on device memory + RowMapType row_map("row_map", nrows + 1); + EntriesType entries("entries", nnz); + ValuesType values ("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + 
Kokkos::deep_copy(values, hvalues); + + // Construct a CRS matrix + A = crsMat_t("CrsMatrix", nrows, nrows, nnz, values, row_map, entries); + } + + // Extract + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); + + // Checking + lno_t numRows = 0; + lno_t numCols = 0; + for(int i = 0; i < nblocks; i++) { + numRows += DiagBlks[i].numRows(); + numCols += DiagBlks[i].numCols(); + } + + EXPECT_TRUE(numRows == static_cast(nrows)); + EXPECT_TRUE(numCols == static_cast(nrows)); + + if (nrows > 0) { + bool flag = true; + lno_t col_start = 0; + for(int i = 0; i < nblocks; i++) { + RowMapType_hm hrow_map_diagblk("hrow_map_diagblk", DiagBlks[i].numRows() + 1); + EntriesType_hm hentries_diagblk("hentries_diagblk", DiagBlks[i].nnz()); + ValuesType_hm hvalues_diagblk ("hvalues_diagblk", DiagBlks[i].nnz()); + + Kokkos::deep_copy(hrow_map_diagblk, DiagBlks[i].graph.row_map); + Kokkos::deep_copy(hentries_diagblk, DiagBlks[i].graph.entries); + Kokkos::deep_copy(hvalues_diagblk, DiagBlks[i].values); + + for(int j = 0; j < static_cast(DiagBlks[i].numRows()); j++) { + size_type k1 = hrow_map_diagblk(j); + size_type k2 = hrow_map_diagblk(j + 1); + for(size_type k = k1; k < k2; k++) { + scalar_t col = static_cast(hentries_diagblk(k) + col_start); + scalar_t val = hvalues_diagblk(k); + if (Kokkos::abs(col- val) != 0) { + flag = false; + break; + } + } + if (flag == false) break; + } + if (flag == false) break; + col_start += DiagBlks[i].numCols(); + } + EXPECT_TRUE(flag); + } +} +} // namespace Test + +template +void test_extract_diagonal_blocks() { + for (int s = 1; s <= 8; s++) { + Test::run_test_extract_diagonal_blocks(0, s); + Test::run_test_extract_diagonal_blocks(3, s); + Test::run_test_extract_diagonal_blocks(12, s); + Test::run_test_extract_diagonal_blocks(123, s); + } +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##extract_diagonal_blocks##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) 
{ \ + test_extract_diagonal_blocks(); \ + } + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST From 8176fe8106454f7c6707708c43fc6e4f91944024 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 30 Aug 2023 03:37:28 -0600 Subject: [PATCH 157/231] Apply clang format --- .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 113 +- sparse/unit_test/Test_Sparse_spiluk.hpp_ | 517 +++++++ sparse/unit_test/Test_Sparse_sptrsv.hpp_ | 1359 +++++++++++++++++ 3 files changed, 1936 insertions(+), 53 deletions(-) create mode 100644 sparse/unit_test/Test_Sparse_spiluk.hpp_ create mode 100644 sparse/unit_test/Test_Sparse_sptrsv.hpp_ diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index f74c095ccb..f7b48c6945 100644 --- a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -27,8 +27,8 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { using RowMapType_hm = typename RowMapType::HostMirror; using EntriesType_hm = typename EntriesType::HostMirror; using ValuesType_hm = typename ValuesType::HostMirror; - using crsMat_t = CrsMatrix; - using AT = Kokkos::ArithTraits; + using crsMat_t = CrsMatrix; + using AT = Kokkos::ArithTraits; crsMat_t A; std::vector DiagBlks(nblocks); @@ -36,58 +36,59 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { if (nrows != 0) { // Generate test matrix const size_type nnz = 2 + (nrows - 2) * 3 + 2; - RowMapType_hm hrow_map("hrow_map", nrows + 1); + RowMapType_hm hrow_map("hrow_map", nrows + 1); EntriesType_hm hentries("hentries", nnz); - ValuesType_hm hvalues ("hvalues", nnz); - + ValuesType_hm hvalues("hvalues", nnz); + // first row hrow_map(0) = 0; hentries(0) = 0; hentries(1) = 1; hvalues(0) = 0; hvalues(1) = 1; - // rows in between + // rows in between int cnt = 2; - for(int i = 1; i <= (nrows-2); i++) { - hrow_map(i) = cnt; - hentries(cnt) = -1 + i; - hentries(cnt+1) 
= 0 + i; - hentries(cnt+2) = 1 + i; - hvalues(cnt) = -1 + i; - hvalues(cnt+1) = 0 + i; - hvalues(cnt+2) = 1 + i; + for (int i = 1; i <= (nrows - 2); i++) { + hrow_map(i) = cnt; + hentries(cnt) = -1 + i; + hentries(cnt + 1) = 0 + i; + hentries(cnt + 2) = 1 + i; + hvalues(cnt) = -1 + i; + hvalues(cnt + 1) = 0 + i; + hvalues(cnt + 2) = 1 + i; cnt += 3; } // last row - hrow_map(nrows-1) = cnt; - hentries(nnz-2) = nrows-2; - hentries(nnz-1) = nrows-1; - hvalues(nnz-2) = nrows-2; - hvalues(nnz-1) = nrows-1; + hrow_map(nrows - 1) = cnt; + hentries(nnz - 2) = nrows - 2; + hentries(nnz - 1) = nrows - 1; + hvalues(nnz - 2) = nrows - 2; + hvalues(nnz - 1) = nrows - 1; // last element of row_map - hrow_map(nrows) = nnz; - + hrow_map(nrows) = nnz; + // Allocate A on device memory - RowMapType row_map("row_map", nrows + 1); + RowMapType row_map("row_map", nrows + 1); EntriesType entries("entries", nnz); - ValuesType values ("values", nnz); - + ValuesType values("values", nnz); + // Copy from host to device Kokkos::deep_copy(row_map, hrow_map); Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - + Kokkos::deep_copy(values, hvalues); + // Construct a CRS matrix A = crsMat_t("CrsMatrix", nrows, nrows, nnz, values, row_map, entries); } // Extract - KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); - + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, + DiagBlks); + // Checking lno_t numRows = 0; lno_t numCols = 0; - for(int i = 0; i < nblocks; i++) { + for (int i = 0; i < nblocks; i++) { numRows += DiagBlks[i].numRows(); numCols += DiagBlks[i].numCols(); } @@ -96,29 +97,30 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { EXPECT_TRUE(numCols == static_cast(nrows)); if (nrows > 0) { - bool flag = true; + bool flag = true; lno_t col_start = 0; - for(int i = 0; i < nblocks; i++) { - RowMapType_hm hrow_map_diagblk("hrow_map_diagblk", DiagBlks[i].numRows() + 1); + for (int i = 0; i < nblocks; 
i++) { + RowMapType_hm hrow_map_diagblk("hrow_map_diagblk", + DiagBlks[i].numRows() + 1); EntriesType_hm hentries_diagblk("hentries_diagblk", DiagBlks[i].nnz()); - ValuesType_hm hvalues_diagblk ("hvalues_diagblk", DiagBlks[i].nnz()); - + ValuesType_hm hvalues_diagblk("hvalues_diagblk", DiagBlks[i].nnz()); + Kokkos::deep_copy(hrow_map_diagblk, DiagBlks[i].graph.row_map); Kokkos::deep_copy(hentries_diagblk, DiagBlks[i].graph.entries); - Kokkos::deep_copy(hvalues_diagblk, DiagBlks[i].values); - - for(int j = 0; j < static_cast(DiagBlks[i].numRows()); j++) { + Kokkos::deep_copy(hvalues_diagblk, DiagBlks[i].values); + + for (int j = 0; j < static_cast(DiagBlks[i].numRows()); j++) { size_type k1 = hrow_map_diagblk(j); size_type k2 = hrow_map_diagblk(j + 1); - for(size_type k = k1; k < k2; k++) { - scalar_t col = static_cast(hentries_diagblk(k) + col_start); - scalar_t val = hvalues_diagblk(k); - if (Kokkos::abs(col- val) != 0) { - flag = false; - break; - } + for (size_type k = k1; k < k2; k++) { + scalar_t col = static_cast(hentries_diagblk(k) + col_start); + scalar_t val = hvalues_diagblk(k); + if (Kokkos::abs(col - val) != 0) { + flag = false; + break; } - if (flag == false) break; + } + if (flag == false) break; } if (flag == false) break; col_start += DiagBlks[i].numCols(); @@ -132,17 +134,22 @@ template void test_extract_diagonal_blocks() { for (int s = 1; s <= 8; s++) { - Test::run_test_extract_diagonal_blocks(0, s); - Test::run_test_extract_diagonal_blocks(3, s); - Test::run_test_extract_diagonal_blocks(12, s); - Test::run_test_extract_diagonal_blocks(123, s); + Test::run_test_extract_diagonal_blocks( + 0, s); + Test::run_test_extract_diagonal_blocks( + 3, s); + Test::run_test_extract_diagonal_blocks( + 12, s); + Test::run_test_extract_diagonal_blocks( + 123, s); } } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##extract_diagonal_blocks##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - 
test_extract_diagonal_blocks(); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##extract_diagonal_blocks##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_extract_diagonal_blocks(); \ } #include diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp_ b/sparse/unit_test/Test_Sparse_spiluk.hpp_ new file mode 100644 index 0000000000..3115bc9649 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp_ @@ -0,0 +1,517 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include + +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_spiluk.hpp" + +#include + +using namespace KokkosSparse; +using namespace KokkosSparse::Experimental; +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; + +namespace Test { + +template +void run_test_spiluk() { + typedef Kokkos::View RowMapType; + typedef Kokkos::View EntriesType; + typedef Kokkos::View ValuesType; + typedef Kokkos::ArithTraits AT; + + const size_type nrows = 9; + const size_type nnz = 21; + + RowMapType row_map("row_map", nrows + 1); + EntriesType 
entries("entries", nnz); + ValuesType values("values", nnz); + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + scalar_t MONE = scalar_t(-1); + + hrow_map(0) = 0; + hrow_map(1) = 3; + hrow_map(2) = 5; + hrow_map(3) = 6; + hrow_map(4) = 9; + hrow_map(5) = 11; + hrow_map(6) = 13; + hrow_map(7) = 15; + hrow_map(8) = 18; + hrow_map(9) = nnz; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 5; + hentries(3) = 1; + hentries(4) = 6; + hentries(5) = 2; + hentries(6) = 0; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 0; + hentries(10) = 4; + hentries(11) = 1; + hentries(12) = 5; + hentries(13) = 2; + hentries(14) = 6; + hentries(15) = 3; + hentries(16) = 4; + hentries(17) = 7; + hentries(18) = 3; + hentries(19) = 4; + hentries(20) = 8; + + hvalues(0) = 10; + hvalues(1) = 0.3; + hvalues(2) = 0.6; + hvalues(3) = 11; + hvalues(4) = 0.7; + hvalues(5) = 12; + hvalues(6) = 5; + hvalues(7) = 13; + hvalues(8) = 1; + hvalues(9) = 4; + hvalues(10) = 14; + hvalues(11) = 3; + hvalues(12) = 15; + hvalues(13) = 7; + hvalues(14) = 16; + hvalues(15) = 6; + hvalues(16) = 5; + hvalues(17) = 17; + hvalues(18) = 2; + hvalues(19) = 2.5; + hvalues(20) = 18; + + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); + + typedef KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space> + KernelHandle; + + KernelHandle kh; + + // SPILUKAlgorithm::SEQLVLSCHD_RP + { + kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, + 4 * nrows); + + auto spiluk_handle = kh.get_spiluk_handle(); + + // Allocate L and U as outputs + RowMapType L_row_map("L_row_map", nrows + 1); + EntriesType L_entries("L_entries", 
spiluk_handle->get_nnzL()); + ValuesType L_values("L_values", spiluk_handle->get_nnzL()); + RowMapType U_row_map("U_row_map", nrows + 1); + EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); + ValuesType U_values("U_values", spiluk_handle->get_nnzU()); + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, + U_row_map, U_entries); + + Kokkos::fence(); + + Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); + Kokkos::resize(L_values, spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); + Kokkos::resize(U_values, spiluk_handle->get_nnzU()); + + spiluk_handle->print_algorithm(); + spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values); + + Kokkos::fence(); + + // Checking + typedef CrsMatrix crsMat_t; + crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, + L_row_map, L_entries); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, + U_row_map, U_entries); + + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows); + Kokkos::deep_copy(e_one, 1.0); + + // Create two views for spmv results + ValuesType bb("bb", nrows); + ValuesType bb_tmp("bb_tmp", nrows); + + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + + kh.destroy_spiluk_handle(); + } + + // SPILUKAlgorithm::SEQLVLSCHD_TP1 + { + kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, + 4 * nrows); + + auto spiluk_handle = kh.get_spiluk_handle(); + + // 
Allocate L and U as outputs + RowMapType L_row_map("L_row_map", nrows + 1); + EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); + ValuesType L_values("L_values", spiluk_handle->get_nnzL()); + RowMapType U_row_map("U_row_map", nrows + 1); + EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); + ValuesType U_values("U_values", spiluk_handle->get_nnzU()); + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, + U_row_map, U_entries); + + Kokkos::fence(); + + Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); + Kokkos::resize(L_values, spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); + Kokkos::resize(U_values, spiluk_handle->get_nnzU()); + + spiluk_handle->print_algorithm(); + spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values); + + Kokkos::fence(); + + // Checking + typedef CrsMatrix crsMat_t; + crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, + L_row_map, L_entries); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, + U_row_map, U_entries); + + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows); + Kokkos::deep_copy(e_one, 1.0); + + // Create two views for spmv results + ValuesType bb("bb", nrows); + ValuesType bb_tmp("bb_tmp", nrows); + + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + + kh.destroy_spiluk_handle(); + } +} + +template +void run_test_spiluk_streams(int test_algo, int nstreams) { + 
using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + using crsMat_t = CrsMatrix; + using AT = Kokkos::ArithTraits; + + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr) { + int num_threads = std::atoi(env_omp_num_threads); + if (num_threads < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + } + } + } +#endif + if (!run_streams_test) + return; + + const size_type nrows = 9; + const size_type nnz = 21; + + std::vector instances; + if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); + + 
RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + scalar_t MONE = scalar_t(-1); + + hrow_map(0) = 0; + hrow_map(1) = 3; + hrow_map(2) = 5; + hrow_map(3) = 6; + hrow_map(4) = 9; + hrow_map(5) = 11; + hrow_map(6) = 13; + hrow_map(7) = 15; + hrow_map(8) = 18; + hrow_map(9) = nnz; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 5; + hentries(3) = 1; + hentries(4) = 6; + hentries(5) = 2; + hentries(6) = 0; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 0; + hentries(10) = 4; + hentries(11) = 1; + hentries(12) = 5; + hentries(13) = 2; + hentries(14) = 6; + hentries(15) = 3; + hentries(16) = 4; + hentries(17) = 7; + hentries(18) = 3; + hentries(19) = 4; + hentries(20) = 8; + + hvalues(0) = 10; + hvalues(1) = 0.3; + hvalues(2) = 0.6; + hvalues(3) = 11; + hvalues(4) = 0.7; + hvalues(5) = 12; + hvalues(6) = 5; + hvalues(7) = 13; + hvalues(8) = 1; + hvalues(9) = 4; + hvalues(10) = 14; + hvalues(11) = 3; + hvalues(12) = 15; + hvalues(13) = 7; + hvalues(14) = 16; + hvalues(15) = 6; + hvalues(16) = 5; + hvalues(17) = 17; + hvalues(18) = 2; + hvalues(19) = 2.5; + hvalues(20) = 18; + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); + A_entries_v[i] = EntriesType("A_entries", nnz); + A_values_v[i] = ValuesType("A_values", nnz); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + if (test_algo == 0) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, + 4 * nrows, 4 * nrows); + else if (test_algo == 1) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, + 4 * nrows, 4 * 
nrows); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + std::cout << " Stream " << i << ": "; + spiluk_handle->print_algorithm(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); + } // Done handle creation and spiluk_symbolic on all streams + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], + A_entries_v[i]); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], + L_row_map_v[i], L_entries_v[i]); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], + U_row_map_v[i], U_entries_v[i]); + + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows); + Kokkos::deep_copy(e_one, 1.0); + + // Create two views for spmv results + ValuesType bb("bb", nrows); + ValuesType bb_tmp("bb_tmp", nrows); 
+ + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + + kh_v[i].destroy_spiluk_handle(); + } +} + +} // namespace Test + +template +void test_spiluk() { + Test::run_test_spiluk(); +} + +template +void test_spiluk_streams() { + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_spiluk_streams(0, 2); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_spiluk_streams(0, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; + Test::run_test_spiluk_streams(0, 4); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_spiluk_streams(1, 2); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_spiluk_streams(1, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_spiluk_streams(1, 4); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_spiluk(); \ + test_spiluk_streams(); \ + } + +#define NO_TEST_COMPLEX + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#undef NO_TEST_COMPLEX diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp_ b/sparse/unit_test/Test_Sparse_sptrsv.hpp_ new file mode 100644 index 0000000000..2425fb4c27 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp_ @@ -0,0 +1,1359 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include + +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include "KokkosSparse_sptrsv.hpp" +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) +#include "KokkosSparse_sptrsv_supernode.hpp" +#endif + +#include + +using namespace KokkosSparse; +using namespace KokkosSparse::Experimental; +using namespace KokkosKernels; +using namespace KokkosKernels::Impl; +using namespace KokkosKernels::Experimental; + +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #endif +// #ifndef kokkos_complex_float +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; + +namespace Test { + +#if 0 +template +void run_test_sptrsv_mtx() { + + typedef typename KokkosSparse::CrsMatrix crsmat_t; + typedef typename crsmat_t::StaticCrsGraphType graph_t; + + //typedef Kokkos::View< size_type*, device > RowMapType; + //typedef Kokkos::View< lno_t*, device > EntriesType; + typedef Kokkos::View< scalar_t*, device > ValuesType; + + // Lower tri + std::cout << "LowerTriTest Begin" << std::endl; + { + +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-offshore-amd.mtx"; +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Transport-amd.mtx"; +// std::string mtx_filename = 
"/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Fault_639amd.mtx"; +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-thermal2-amd.mtx"; + std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-dielFilterV2real-amd.mtx"; + std::cout << "Matrix file: " << mtx_filename << std::endl; + crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix + graph_t lgraph = triMtx.graph; // in_graph + + auto row_map = lgraph.row_map; + auto entries = lgraph.entries; + auto values = triMtx.values; + + const size_type nrows = lgraph.numRows(); +// const size_type nnz = triMtx.nnz(); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; + + std::cout << "UnitTest nrows = " << nrows << std::endl; + + KernelHandle kh; + bool is_lower_tri = true; + std::cout << "Create handle" << std::endl; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + + std::cout << "Prepare linear system" << std::endl; + // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + +// typedef CrsMatrix crsMat_t; +// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + + std::cout << "SPMV" << std::endl; + KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); + + std::cout << "TriSolve Symbolic" << std::endl; + Kokkos::Timer timer; + sptrsv_symbolic( &kh, row_map, entries ); + std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl; + + std::cout << "TriSolve 
Solve" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "LTRI Solve TEAMPOLICY! Time: " << timer.seconds() << std::endl; + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Lower Tri Solve SUCCESS!" << std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "LTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Lower Tri Solve SUCCESS!" 
<< std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "LTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Lower Tri Solve SUCCESS!" << std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + + kh.destroy_sptrsv_handle(); + } + // Upper tri + std::cout << "UpperTriTest Begin" << std::endl; + { +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-offshore-amd.mtx"; +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Transport-amd.mtx"; +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Fault_639amd.mtx"; +// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-thermal2-amd.mtx"; + std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-dielFilterV2real-amd.mtx"; + std::cout << "Matrix file: " << mtx_filename << std::endl; + crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix + graph_t lgraph = triMtx.graph; // in_graph + + auto row_map = lgraph.row_map; + 
auto entries = lgraph.entries; + auto values = triMtx.values; + + const size_type nrows = lgraph.numRows(); +// const size_type nnz = triMtx.nnz(); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; + + std::cout << "UnitTest nrows = " << nrows << std::endl; + + KernelHandle kh; + bool is_lower_tri = false; + std::cout << "Create handle" << std::endl; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + + std::cout << "Prepare linear system" << std::endl; + // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + +// typedef CrsMatrix crsMat_t; +// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + std::cout << "SPMV" << std::endl; + KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); + + std::cout << "TriSolve Symbolic" << std::endl; + Kokkos::Timer timer; + sptrsv_symbolic( &kh, row_map, entries ); + std::cout << "UTRI Symbolic Time: " << timer.seconds() << std::endl; + + std::cout << "TriSolve Solve" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "UTRI Solve SEQLVLSCHD_TP1 Time: " << timer.seconds() << std::endl; + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Upper Tri Solve SUCCESS!" 
<< std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "UTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Upper Tri Solve SUCCESS!" << std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + kh.get_sptrsv_handle()->print_algorithm(); + timer.reset(); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + std::cout << "UTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + } + else { + std::cout << "Upper Tri Solve SUCCESS!" 
<< std::endl; + //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + + kh.destroy_sptrsv_handle(); + } + +} +#endif + +namespace { +template +struct ReductionCheck { + using lno_t = OrdinalType; + using value_type = ValueType; + + ViewType lhs; + + ReductionCheck(const ViewType &lhs_) : lhs(lhs_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(lno_t i, value_type &tsum) const { tsum += lhs(i); } +}; +} // namespace + +template +void run_test_sptrsv() { + typedef Kokkos::View RowMapType; + typedef Kokkos::View EntriesType; + typedef Kokkos::View ValuesType; + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + const size_type nrows = 5; + const size_type nnz = 10; + + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + using host_crsmat_t = typename KernelHandle::SPTRSVHandleType::host_crsmat_t; + using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; + + using row_map_view_t = typename host_graph_t::row_map_type::non_const_type; + using cols_view_t = typename host_graph_t::entries_type::non_const_type; + using values_view_t = typename host_crsmat_t::values_type::non_const_type; + + // L & U handle for supernodal SpTrsv + KernelHandle khL; + KernelHandle khU; + + // right-hand-side and solution + ValuesType B("rhs", nrows); + ValuesType X("sol", nrows); + + // host CRS for L & U + host_crsmat_t L, U, Ut; +#endif + + // Upper tri + { + RowMapType row_map("row_map", nrows + 1); + EntriesType entries("entries", nnz); + ValuesType values("values", nnz); + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + hrow_map(0) = 0; + hrow_map(1) = 2; 
+ hrow_map(2) = 4; + hrow_map(3) = 7; + hrow_map(4) = 9; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 1; + hentries(3) = 4; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 4; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + typedef CrsMatrix crsMat_t; + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + { + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + 
EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << + "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + */ + + kh.destroy_sptrsv_handle(); + } + + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + kh.destroy_sptrsv_handle(); + } + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + 
if (sum != lhs.extent(0)) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + kh.destroy_sptrsv_handle(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + const scalar_t FIVE = scalar_t(5); + const size_type nnz_sp = 14; + { + // U in csr + row_map_view_t hUrowptr("hUrowptr", nrows + 1); + cols_view_t hUcolind("hUcolind", nnz_sp); + values_view_t hUvalues("hUvalues", nnz_sp); + + // rowptr + hUrowptr(0) = 0; + hUrowptr(1) = 4; + hUrowptr(2) = 8; + hUrowptr(3) = 11; + hUrowptr(4) = 13; + hUrowptr(5) = 14; + + // colind + // first row (first supernode) + hUcolind(0) = 0; + hUcolind(1) = 1; + hUcolind(2) = 2; + hUcolind(3) = 4; + // second row (first supernode) + hUcolind(4) = 0; + hUcolind(5) = 1; + hUcolind(6) = 2; + hUcolind(7) = 4; + // third row (second supernode) + hUcolind(8) = 2; + hUcolind(9) = 3; + hUcolind(10) = 4; + // fourth row (third supernode) + hUcolind(11) = 3; + hUcolind(12) = 4; + // fifth row (fourth supernode) + hUcolind(13) = 4; + + // values + // first row (first supernode) + hUvalues(0) = FIVE; + hUvalues(1) = ONE; + hUvalues(2) = ONE; + hUvalues(3) = ZERO; + // second row (first supernode) + hUvalues(4) = ZERO; + hUvalues(5) = FIVE; + hUvalues(6) = ZERO; + hUvalues(7) = ONE; + // third row (second supernode) + hUvalues(8) = FIVE; + hUvalues(9) = ONE; + hUvalues(10) = ONE; + // fourth row (third supernode) + hUvalues(11) = FIVE; + hUvalues(12) = ONE; + // fifth row (fourth supernode) + hUvalues(13) = FIVE; + + // save U for Supernodal Sptrsv + host_graph_t static_graph(hUcolind, hUrowptr); + U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); + + // create handle for Supernodal Sptrsv + bool is_lower_tri = false; + khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // X = U*ONES to generate B = A*ONES (on device) + { + RowMapType Urowptr("Urowptr", nrows + 1); + EntriesType 
Ucolind("Ucolind", nnz_sp); + ValuesType Uvalues("Uvalues", nnz_sp); + + Kokkos::deep_copy(Urowptr, hUrowptr); + Kokkos::deep_copy(Ucolind, hUcolind); + Kokkos::deep_copy(Uvalues, hUvalues); + + crsMat_t mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); + Kokkos::deep_copy(B, ONE); + KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); + } + } + + { + // U in csc (for inverting off-diag) + row_map_view_t hUcolptr("hUcolptr", nrows + 1); + cols_view_t hUrowind("hUrowind", nnz_sp); + values_view_t hUvalues("hUvalues", nnz_sp); + + // colptr + hUcolptr(0) = 0; + hUcolptr(1) = 2; + hUcolptr(2) = 4; + hUcolptr(3) = 7; + hUcolptr(4) = 9; + hUcolptr(5) = 14; + + // colind + // first column (first supernode) + hUrowind(0) = 0; + hUrowind(1) = 1; + // second column (first supernode) + hUrowind(2) = 0; + hUrowind(3) = 1; + // third column (second supernode) + hUrowind(4) = 2; + hUrowind(5) = 0; + hUrowind(6) = 1; + // fourth column (third supernode) + hUrowind(7) = 3; + hUrowind(8) = 2; + // fifth column (fourth supernode) + hUrowind(9) = 4; + hUrowind(10) = 0; + hUrowind(11) = 1; + hUrowind(12) = 2; + hUrowind(13) = 3; + + // values + // first column (first supernode) + hUvalues(0) = FIVE; + hUvalues(1) = ZERO; + // second column (first supernode) + hUvalues(2) = ONE; + hUvalues(3) = FIVE; + // third column (second supernode) + hUvalues(4) = FIVE; + hUvalues(5) = ONE; + hUvalues(6) = ZERO; + // fourth column (third supernode) + hUvalues(7) = FIVE; + hUvalues(8) = ONE; + // fifth column (fourth supernode) + hUvalues(9) = FIVE; + hUvalues(10) = ZERO; + hUvalues(11) = ONE; + hUvalues(12) = ONE; + hUvalues(13) = ONE; + + // store Ut in crsmat + host_graph_t static_graph(hUrowind, hUcolptr); + Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); + } +#endif + } + + // Lower tri + { + RowMapType row_map("row_map", nrows + 1); + EntriesType entries("entries", nnz); + ValuesType values("values", nnz); + + auto hrow_map = Kokkos::create_mirror_view(row_map); + 
auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + hrow_map(0) = 0; + hrow_map(1) = 1; + hrow_map(2) = 2; + hrow_map(3) = 4; + hrow_map(4) = 6; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 1; + hentries(2) = 0; + hentries(3) = 2; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 1; + hentries(7) = 2; + hentries(8) = 3; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + typedef CrsMatrix crsMat_t; + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + { + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), 
sum); + if (sum != lhs.extent(0)) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << + "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + */ + + kh.destroy_sptrsv_handle(); + } + + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + kh.destroy_sptrsv_handle(); + } + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + 
Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, + lhs.extent(0)), + ReductionCheck(lhs), sum); + if (sum != lhs.extent(0)) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); + + kh.destroy_sptrsv_handle(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + { + // L in csc + const scalar_t TWO = scalar_t(2); + const scalar_t FIVE = scalar_t(5); + const size_type nnz_sp = 14; + + row_map_view_t hLcolptr("hUcolptr", nrows + 1); + cols_view_t hLrowind("hUrowind", nnz_sp); + values_view_t hLvalues("hUvalues", nnz_sp); + + // colptr + hLcolptr(0) = 0; + hLcolptr(1) = 4; + hLcolptr(2) = 8; + hLcolptr(3) = 11; + hLcolptr(4) = 13; + hLcolptr(5) = 14; + + // rowind + // first column (first supernode) + hLrowind(0) = 0; + hLrowind(1) = 1; + hLrowind(2) = 2; + hLrowind(3) = 4; + // second column (first supernode) + hLrowind(4) = 0; + hLrowind(5) = 1; + hLrowind(6) = 2; + hLrowind(7) = 4; + // third column (second supernode) + hLrowind(8) = 2; + hLrowind(9) = 3; + hLrowind(10) = 4; + // fourth column (third supernode) + hLrowind(11) = 3; + hLrowind(12) = 4; + // fifth column (fourth supernode) + hLrowind(13) = 4; + + // values + // first column (first supernode) + hLvalues(0) = FIVE; + hLvalues(1) = TWO; + hLvalues(2) = ONE; + hLvalues(3) = ZERO; + // second column (first supernode) + hLvalues(4) = ZERO; + hLvalues(5) = FIVE; + hLvalues(6) = ZERO; + hLvalues(7) = ONE; + // third column (second supernode) + hLvalues(8) = FIVE; + hLvalues(9) = ONE; + hLvalues(10) = ONE; + // fourth column (third supernode) + hLvalues(11) = FIVE; + hLvalues(12) = ONE; + // fifth column (fourth supernode) + hLvalues(13) = FIVE; + + // store Lt in crsmat + host_graph_t static_graph(hLrowind, hLcolptr); + L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); + + bool is_lower_tri = true; + 
khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) + { + RowMapType Lcolptr("Lcolptr", nrows + 1); + EntriesType Lrowind("Lrowind", nnz_sp); + ValuesType Lvalues("Lvalues", nnz_sp); + + Kokkos::deep_copy(Lcolptr, hLcolptr); + Kokkos::deep_copy(Lrowind, hLrowind); + Kokkos::deep_copy(Lvalues, hLvalues); + + crsMat_t mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); + KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); + } + } + + { + // unit-test for supernode SpTrsv (default) + // > set up supernodes (block size = one) + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate graph internally + + // invert diagonal blocks + bool invert_diag = true; + khL.set_sptrsv_invert_diagonal(invert_diag); + khU.set_sptrsv_invert_diagonal(invert_diag); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khL, U.graph, &khU); + // > numeric (on host) + sptrsv_compute(&khL, L); + sptrsv_compute(&khU, U); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khL, &khU, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, X.extent(0)), + ReductionCheck(X), sum); + if (sum != lhs.extent(0)) { + std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." 
+ << lhs.extent(0) << std::endl; + khL.get_sptrsv_handle()->print_algorithm(); + } else { + std::cout << "Supernode Tri Solve SUCCESS" << std::endl; + khL.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(X.extent(0))); + + khL.destroy_sptrsv_handle(); + khU.destroy_sptrsv_handle(); + } + + { + // unit-test for supernode SpTrsv (running TRMM on device for compute) + // > set up supernodes + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate tree internally + + // > create handles + KernelHandle khLd; + KernelHandle khUd; + khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); + khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, false); + + // > invert diagonal blocks + bool invert_diag = true; + khLd.set_sptrsv_invert_diagonal(invert_diag); + khUd.set_sptrsv_invert_diagonal(invert_diag); + + // > invert off-diagonal blocks + bool invert_offdiag = true; + khUd.set_sptrsv_column_major(true); + khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); + khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); + + // > forcing sptrsv compute to perform TRMM on device + khLd.set_sptrsv_diag_supernode_sizes(1, 1); + khUd.set_sptrsv_diag_supernode_sizes(1, 1); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khLd, Ut.graph, &khUd); + // > numeric (on host) + sptrsv_compute(&khLd, L); + sptrsv_compute(&khUd, Ut); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khLd, &khUd, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, X.extent(0)), + ReductionCheck(X), sum); + if (sum != lhs.extent(0)) { + std::cout << 
"Supernode Tri Solve FAILURE : " << sum << " vs." + << lhs.extent(0) << std::endl; + khLd.get_sptrsv_handle()->print_algorithm(); + } else { + std::cout << "Supernode Tri Solve SUCCESS" << std::endl; + khLd.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(X.extent(0))); + + khLd.destroy_sptrsv_handle(); + khUd.destroy_sptrsv_handle(); + } +#endif + } +} + +template +void run_test_sptrsv_streams(int test_algo, int nstreams) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + using crsMat_t = CrsMatrix; + + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr) { + int num_threads = std::atoi(env_omp_num_threads); + if (num_threads < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + } + } + } +#endif + if (!run_streams_test) + return; + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + const size_type nrows = 5; + const size_type nnz = 10; + + std::vector instances; + if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else // (nstreams == 4) + 
instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector row_map_v(nstreams); + std::vector entries_v(nstreams); + std::vector values_v(nstreams); + std::vector rhs_v(nstreams); + std::vector lhs_v(nstreams); + + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); + + // Upper tri + { + hrow_map(0) = 0; + hrow_map(1) = 2; + hrow_map(2) = 4; + hrow_map(3) = 7; + hrow_map(4) = 9; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 1; + hentries(3) = 4; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 4; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate U + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = false; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) 
+ kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } + + // Lower tri + { + hrow_map(0) = 0; + hrow_map(1) = 1; + hrow_map(2) = 2; + hrow_map(3) = 4; + hrow_map(4) = 6; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 1; + hentries(2) = 0; + hentries(3) = 2; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 1; + hentries(7) = 2; + hentries(8) = 3; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate L + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find 
+ lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = true; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } +} + +} // namespace Test + +template +void test_sptrsv() { + Test::run_test_sptrsv(); + // Test::run_test_sptrsv_mtx(); +} + +template +void test_sptrsv_streams() { + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 2); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << 
std::endl; + Test::run_test_sptrsv_streams(0, 4); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 2); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 4); + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if (std::is_same::value && + std::is_same::value) { + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 2); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 3); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 4); + } +#endif +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_sptrsv(); \ + test_sptrsv_streams(); \ + } + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST From e5e424a4dcb344f06a463670858f6a64f28fca38 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 30 Aug 2023 03:38:36 -0600 Subject: [PATCH 158/231] Delete redundant files --- sparse/unit_test/Test_Sparse_spiluk.hpp_ | 517 -------- sparse/unit_test/Test_Sparse_sptrsv.hpp_ | 1359 ---------------------- 2 files changed, 1876 deletions(-) delete mode 100644 sparse/unit_test/Test_Sparse_spiluk.hpp_ delete mode 100644 sparse/unit_test/Test_Sparse_sptrsv.hpp_ diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp_ b/sparse/unit_test/Test_Sparse_spiluk.hpp_ deleted file mode 100644 index 3115bc9649..0000000000 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp_ +++ /dev/null @@ -1,517 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include -#include - -#include "KokkosSparse_Utils.hpp" -#include "KokkosSparse_CrsMatrix.hpp" -#include -#include "KokkosBlas1_nrm2.hpp" -#include "KokkosSparse_spmv.hpp" -#include "KokkosSparse_spiluk.hpp" - -#include - -using namespace KokkosSparse; -using namespace KokkosSparse::Experimental; -using namespace KokkosKernels; -using namespace KokkosKernels::Experimental; - -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; - -namespace Test { - -template -void run_test_spiluk() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; - typedef Kokkos::ArithTraits AT; - - const size_type nrows = 9; - const size_type nnz = 21; - - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - 
hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; - - KernelHandle kh; - - // SPILUKAlgorithm::SEQLVLSCHD_RP - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, - 4 * nrows); - - auto spiluk_handle = kh.get_spiluk_handle(); - - // Allocate L and U as outputs - RowMapType L_row_map("L_row_map", nrows + 1); - EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); - RowMapType U_row_map("U_row_map", nrows + 1); - EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; - - spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, - U_row_map, U_entries); - - Kokkos::fence(); - - Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(L_values, spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); 
- Kokkos::resize(U_values, spiluk_handle->get_nnzU()); - - spiluk_handle->print_algorithm(); - spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); - - Kokkos::fence(); - - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); - - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); - - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); - - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); - - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); - - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); - - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); - - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); - - kh.destroy_spiluk_handle(); - } - - // SPILUKAlgorithm::SEQLVLSCHD_TP1 - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, - 4 * nrows); - - auto spiluk_handle = kh.get_spiluk_handle(); - - // Allocate L and U as outputs - RowMapType L_row_map("L_row_map", nrows + 1); - EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); - RowMapType U_row_map("U_row_map", nrows + 1); - EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; - - spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, - U_row_map, U_entries); - - Kokkos::fence(); - - Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - 
Kokkos::resize(L_values, spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - Kokkos::resize(U_values, spiluk_handle->get_nnzU()); - - spiluk_handle->print_algorithm(); - spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); - - Kokkos::fence(); - - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); - - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); - - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); - - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); - - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); - - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); - - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); - - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); - - kh.destroy_spiluk_handle(); - } -} - -template -void run_test_spiluk_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - using AT = 
Kokkos::ArithTraits; - - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition - bool run_streams_test = true; -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); - if (env_omp_num_threads != nullptr) { - int num_threads = std::atoi(env_omp_num_threads); - if (num_threads < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; - } - } - } -#endif - if (!run_streams_test) - return; - - const size_type nrows = 9; - const size_type nnz = 21; - - std::vector instances; - if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector A_row_map_v(nstreams); - std::vector A_entries_v(nstreams); - std::vector A_values_v(nstreams); - std::vector L_row_map_v(nstreams); - std::vector L_entries_v(nstreams); - std::vector L_values_v(nstreams); - std::vector U_row_map_v(nstreams); - std::vector U_entries_v(nstreams); - std::vector U_values_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - 
hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; - - for (int i = 0; i < nstreams; i++) { - // Allocate A as input - A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); - A_entries_v[i] = EntriesType("A_entries", nnz); - A_values_v[i] = ValuesType("A_values", nnz); - - // Copy from host to device - Kokkos::deep_copy(A_row_map_v[i], hrow_map); - Kokkos::deep_copy(A_entries_v[i], hentries); - Kokkos::deep_copy(A_values_v[i], hvalues); - - // Create handle - kh_v[i] = KernelHandle(); - if (test_algo == 0) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - 4 * nrows, 4 * nrows); - else if (test_algo == 1) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, - 4 * nrows, 4 * nrows); - kh_ptr_v[i] = &kh_v[i]; - - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - std::cout << " Stream " << i << ": "; - spiluk_handle->print_algorithm(); - - // Allocate L and U as outputs - L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); - L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); - L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); - U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); - U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); - U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); - - // Symbolic phase - 
spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], - L_row_map_v[i], L_entries_v[i], U_row_map_v[i], - U_entries_v[i], nstreams); - - Kokkos::fence(); - - Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); - Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); - Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); - } // Done handle creation and spiluk_symbolic on all streams - - // Numeric phase - spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, - A_entries_v, A_values_v, L_row_map_v, L_entries_v, - L_values_v, U_row_map_v, U_entries_v, U_values_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], - A_entries_v[i]); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], - L_row_map_v[i], L_entries_v[i]); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], - U_row_map_v[i], U_entries_v[i]); - - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); - - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); - - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); - - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); - - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); - - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); - - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); - - kh_v[i].destroy_spiluk_handle(); - } -} - -} // namespace Test - -template -void test_spiluk() { - Test::run_test_spiluk(); -} - -template -void test_spiluk_streams() { - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" 
<< std::endl; - Test::run_test_spiluk_streams(0, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_spiluk_streams(0, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_spiluk_streams(0, 4); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_spiluk_streams(1, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_spiluk_streams(1, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_spiluk_streams(1, 4); -} - -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_spiluk(); \ - test_spiluk_streams(); \ - } - -#define NO_TEST_COMPLEX - -#include - -#undef KOKKOSKERNELS_EXECUTE_TEST -#undef NO_TEST_COMPLEX diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp_ b/sparse/unit_test/Test_Sparse_sptrsv.hpp_ deleted file mode 100644 index 2425fb4c27..0000000000 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp_ +++ /dev/null @@ -1,1359 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include -#include - -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_Utils.hpp" -#include "KokkosSparse_spmv.hpp" -#include "KokkosSparse_CrsMatrix.hpp" - -#include "KokkosSparse_sptrsv.hpp" -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -#include "KokkosSparse_sptrsv_supernode.hpp" -#endif - -#include - -using namespace KokkosSparse; -using namespace KokkosSparse::Experimental; -using namespace KokkosKernels; -using namespace KokkosKernels::Impl; -using namespace KokkosKernels::Experimental; - -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #endif -// #ifndef kokkos_complex_float -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; - -namespace Test { - -#if 0 -template -void run_test_sptrsv_mtx() { - - typedef typename KokkosSparse::CrsMatrix crsmat_t; - typedef typename crsmat_t::StaticCrsGraphType graph_t; - - //typedef Kokkos::View< size_type*, device > RowMapType; - //typedef Kokkos::View< lno_t*, device > EntriesType; - typedef Kokkos::View< scalar_t*, device > ValuesType; - - // Lower tri - std::cout << "LowerTriTest Begin" << std::endl; - { - -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-thermal2-amd.mtx"; - std::string mtx_filename = 
"/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = true; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl; - - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve TEAMPOLICY! 
Time: " << timer.seconds() << std::endl; - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" 
<< std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - - kh.destroy_sptrsv_handle(); - } - // Upper tri - std::cout << "UpperTriTest Begin" << std::endl; - { -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-thermal2-amd.mtx"; - std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = false; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - 
Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "UTRI Symbolic Time: " << timer.seconds() << std::endl; - - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_TP1 Time: " << timer.seconds() << std::endl; - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" 
<< std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - - kh.destroy_sptrsv_handle(); - } - -} -#endif - -namespace { -template -struct ReductionCheck { - using lno_t = OrdinalType; - using value_type = ValueType; - - ViewType lhs; - - ReductionCheck(const ViewType &lhs_) : lhs(lhs_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(lno_t i, value_type &tsum) const { tsum += lhs(i); } -}; -} // namespace - -template -void run_test_sptrsv() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - const size_type nrows = 5; - const size_type nnz = 10; - - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space>; - -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using host_crsmat_t = typename KernelHandle::SPTRSVHandleType::host_crsmat_t; - using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; - - using row_map_view_t = typename 
host_graph_t::row_map_type::non_const_type; - using cols_view_t = typename host_graph_t::entries_type::non_const_type; - using values_view_t = typename host_crsmat_t::values_type::non_const_type; - - // L & U handle for supernodal SpTrsv - KernelHandle khL; - KernelHandle khU; - - // right-hand-side and solution - ValuesType B("rhs", nrows); - ValuesType X("sol", nrows); - - // host CRS for L & U - host_crsmat_t L, U, Ut; -#endif - - // Upper tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, 
entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ - - kh.destroy_sptrsv_handle(); - } - - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), 
sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } -#endif - -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - { - // U in csr - row_map_view_t hUrowptr("hUrowptr", nrows + 1); - cols_view_t hUcolind("hUcolind", nnz_sp); - values_view_t hUvalues("hUvalues", nnz_sp); - - // rowptr - hUrowptr(0) = 0; - hUrowptr(1) = 4; - hUrowptr(2) = 8; - hUrowptr(3) = 11; - hUrowptr(4) = 13; - hUrowptr(5) = 14; - - // colind - // first row (first supernode) - hUcolind(0) = 0; - hUcolind(1) = 1; - hUcolind(2) = 2; - hUcolind(3) = 4; - // second row (first supernode) - hUcolind(4) = 0; - hUcolind(5) = 1; - hUcolind(6) = 2; - hUcolind(7) = 4; - // third row (second supernode) - hUcolind(8) = 2; - hUcolind(9) = 3; - hUcolind(10) = 4; - // fourth row (third supernode) - hUcolind(11) = 3; - hUcolind(12) = 4; - // fifth row (fourth supernode) - hUcolind(13) = 4; - - // values - // first row (first supernode) - hUvalues(0) = FIVE; - hUvalues(1) = ONE; - hUvalues(2) = ONE; - hUvalues(3) = 
ZERO; - // second row (first supernode) - hUvalues(4) = ZERO; - hUvalues(5) = FIVE; - hUvalues(6) = ZERO; - hUvalues(7) = ONE; - // third row (second supernode) - hUvalues(8) = FIVE; - hUvalues(9) = ONE; - hUvalues(10) = ONE; - // fourth row (third supernode) - hUvalues(11) = FIVE; - hUvalues(12) = ONE; - // fifth row (fourth supernode) - hUvalues(13) = FIVE; - - // save U for Supernodal Sptrsv - host_graph_t static_graph(hUcolind, hUrowptr); - U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); - - // create handle for Supernodal Sptrsv - bool is_lower_tri = false; - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // X = U*ONES to generate B = A*ONES (on device) - { - RowMapType Urowptr("Urowptr", nrows + 1); - EntriesType Ucolind("Ucolind", nnz_sp); - ValuesType Uvalues("Uvalues", nnz_sp); - - Kokkos::deep_copy(Urowptr, hUrowptr); - Kokkos::deep_copy(Ucolind, hUcolind); - Kokkos::deep_copy(Uvalues, hUvalues); - - crsMat_t mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); - Kokkos::deep_copy(B, ONE); - KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); - } - } - - { - // U in csc (for inverting off-diag) - row_map_view_t hUcolptr("hUcolptr", nrows + 1); - cols_view_t hUrowind("hUrowind", nnz_sp); - values_view_t hUvalues("hUvalues", nnz_sp); - - // colptr - hUcolptr(0) = 0; - hUcolptr(1) = 2; - hUcolptr(2) = 4; - hUcolptr(3) = 7; - hUcolptr(4) = 9; - hUcolptr(5) = 14; - - // colind - // first column (first supernode) - hUrowind(0) = 0; - hUrowind(1) = 1; - // second column (first supernode) - hUrowind(2) = 0; - hUrowind(3) = 1; - // third column (second supernode) - hUrowind(4) = 2; - hUrowind(5) = 0; - hUrowind(6) = 1; - // fourth column (third supernode) - hUrowind(7) = 3; - hUrowind(8) = 2; - // fifth column (fourth supernode) - hUrowind(9) = 4; - hUrowind(10) = 0; - hUrowind(11) = 1; - hUrowind(12) = 2; - hUrowind(13) = 3; - - // values - // first column (first supernode) - hUvalues(0) = FIVE; - 
hUvalues(1) = ZERO; - // second column (first supernode) - hUvalues(2) = ONE; - hUvalues(3) = FIVE; - // third column (second supernode) - hUvalues(4) = FIVE; - hUvalues(5) = ONE; - hUvalues(6) = ZERO; - // fourth column (third supernode) - hUvalues(7) = FIVE; - hUvalues(8) = ONE; - // fifth column (fourth supernode) - hUvalues(9) = FIVE; - hUvalues(10) = ZERO; - hUvalues(11) = ONE; - hUvalues(12) = ONE; - hUvalues(13) = ONE; - - // store Ut in crsmat - host_graph_t static_graph(hUrowind, hUcolptr); - Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); - } -#endif - } - - // Lower tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - - { - KernelHandle kh; - bool is_lower_tri = true; - 
kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ - - kh.destroy_sptrsv_handle(); - } - - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - 
sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } -#endif - -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - { - // L in csc - const scalar_t TWO = scalar_t(2); - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - - row_map_view_t hLcolptr("hUcolptr", nrows + 1); - cols_view_t hLrowind("hUrowind", nnz_sp); - values_view_t hLvalues("hUvalues", nnz_sp); - - // colptr - hLcolptr(0) = 0; - hLcolptr(1) = 4; - hLcolptr(2) = 8; - hLcolptr(3) = 11; - hLcolptr(4) = 13; - hLcolptr(5) = 14; - - // rowind - // first column (first supernode) - hLrowind(0) = 0; - hLrowind(1) = 1; - hLrowind(2) = 2; - hLrowind(3) = 4; - // second column (first supernode) - hLrowind(4) = 0; - hLrowind(5) = 1; - hLrowind(6) = 2; - hLrowind(7) = 4; - // third column (second supernode) - hLrowind(8) = 2; - hLrowind(9) = 3; - hLrowind(10) = 4; - // fourth column 
(third supernode) - hLrowind(11) = 3; - hLrowind(12) = 4; - // fifth column (fourth supernode) - hLrowind(13) = 4; - - // values - // first column (first supernode) - hLvalues(0) = FIVE; - hLvalues(1) = TWO; - hLvalues(2) = ONE; - hLvalues(3) = ZERO; - // second column (first supernode) - hLvalues(4) = ZERO; - hLvalues(5) = FIVE; - hLvalues(6) = ZERO; - hLvalues(7) = ONE; - // third column (second supernode) - hLvalues(8) = FIVE; - hLvalues(9) = ONE; - hLvalues(10) = ONE; - // fourth column (third supernode) - hLvalues(11) = FIVE; - hLvalues(12) = ONE; - // fifth column (fourth supernode) - hLvalues(13) = FIVE; - - // store Lt in crsmat - host_graph_t static_graph(hLrowind, hLcolptr); - L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); - - bool is_lower_tri = true; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) - { - RowMapType Lcolptr("Lcolptr", nrows + 1); - EntriesType Lrowind("Lrowind", nnz_sp); - ValuesType Lvalues("Lvalues", nnz_sp); - - Kokkos::deep_copy(Lcolptr, hLcolptr); - Kokkos::deep_copy(Lrowind, hLrowind); - Kokkos::deep_copy(Lvalues, hLvalues); - - crsMat_t mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); - KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); - } - } - - { - // unit-test for supernode SpTrsv (default) - // > set up supernodes (block size = one) - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate graph internally - - // invert diagonal blocks - bool invert_diag = true; - khL.set_sptrsv_invert_diagonal(invert_diag); - khU.set_sptrsv_invert_diagonal(invert_diag); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, - &khL, U.graph, &khU); - // > 
numeric (on host) - sptrsv_compute(&khL, L); - sptrsv_compute(&khU, U); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khL, &khU, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." - << lhs.extent(0) << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(X.extent(0))); - - khL.destroy_sptrsv_handle(); - khU.destroy_sptrsv_handle(); - } - - { - // unit-test for supernode SpTrsv (running TRMM on device for compute) - // > set up supernodes - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate tree internally - - // > create handles - KernelHandle khLd; - KernelHandle khUd; - khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); - khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, false); - - // > invert diagonal blocks - bool invert_diag = true; - khLd.set_sptrsv_invert_diagonal(invert_diag); - khUd.set_sptrsv_invert_diagonal(invert_diag); - - // > invert off-diagonal blocks - bool invert_offdiag = true; - khUd.set_sptrsv_column_major(true); - khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); - khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); - - // > forcing sptrsv compute to perform TRMM on device - khLd.set_sptrsv_diag_supernode_sizes(1, 1); - khUd.set_sptrsv_diag_supernode_sizes(1, 1); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, 
L.graph, - &khLd, Ut.graph, &khUd); - // > numeric (on host) - sptrsv_compute(&khLd, L); - sptrsv_compute(&khUd, Ut); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khLd, &khUd, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." - << lhs.extent(0) << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(X.extent(0))); - - khLd.destroy_sptrsv_handle(); - khUd.destroy_sptrsv_handle(); - } -#endif - } -} - -template -void run_test_sptrsv_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition - bool run_streams_test = true; -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); - if (env_omp_num_threads != nullptr) { - int num_threads = std::atoi(env_omp_num_threads); - if (num_threads < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << 
num_threads << std::endl; - } - } - } -#endif - if (!run_streams_test) - return; - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - const size_type nrows = 5; - const size_type nnz = 10; - - std::vector instances; - if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else // (nstreams == 4) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector row_map_v(nstreams); - std::vector entries_v(nstreams); - std::vector values_v(nstreams); - std::vector rhs_v(nstreams); - std::vector lhs_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - // Upper tri - { - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - for (int i = 0; i < nstreams; i++) { - // Allocate U - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs 
generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = false; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); - - kh_v[i].destroy_sptrsv_handle(); - } - } - - // Lower tri - { - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - for (int i = 0; i < nstreams; i++) { - // Allocate L - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = 
EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = true; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); - - 
kh_v[i].destroy_sptrsv_handle(); - } - } -} - -} // namespace Test - -template -void test_sptrsv() { - Test::run_test_sptrsv(); - // Test::run_test_sptrsv_mtx(); -} - -template -void test_sptrsv_streams() { - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 4); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 4); - -#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (std::is_same::value && - std::is_same::value) { - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 2); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 3); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 4); - } -#endif -} - -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_sptrsv(); \ - test_sptrsv_streams(); \ - } - -#include - -#undef KOKKOSKERNELS_EXECUTE_TEST From 7dfeca33620fe6bdf3f0622d96b271b75421b915 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 30 Aug 2023 08:16:17 -0700 Subject: [PATCH 159/231] Remove unused type alias AT --- sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp | 1 - 1 file changed, 1 deletion(-) diff --git 
a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index f7b48c6945..69d8eabb0a 100644 --- a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -28,7 +28,6 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { using EntriesType_hm = typename EntriesType::HostMirror; using ValuesType_hm = typename ValuesType::HostMirror; using crsMat_t = CrsMatrix; - using AT = Kokkos::ArithTraits; crsMat_t A; std::vector DiagBlks(nblocks); From 1c79f55864ff4ff7b378b5685331d8396f7f4726 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 30 Aug 2023 17:54:20 -0600 Subject: [PATCH 160/231] update testing scripts cm_generate_makefile: * Add new options to pass extra cmake flags to kokkos or kokkos-kernels * Add option to enable deprecated code warnings (default is off) * Update deprecated code option to 4 cm_test_all_sandia: * Source lmod script on MI250 queue (unavailable on login) --- cm_generate_makefile.bash | 22 ++++++++++++++++++---- scripts/cm_test_all_sandia | 1 + 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index adb1678908..3358ae2eb8 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -367,8 +367,11 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" + echo "--deprecated-code-warnings Enable deprecated code warnings (disabled by default)" echo "--export-compile-commands: export cmake compile_commands.json file" echo "--enable-docs: build the Kokkos Kernels developer documentation (requires sphinx, doxygen)" + echo "--cmake-flags=[CMAKE Command options]: Set Kokkos Kernels cmake options not handled by script" + echo 
"--kokkos-cmake-flags=[CMAKE Command options]: Set Kokkos cmake options not handled by script" } @@ -385,6 +388,7 @@ KOKKOSKERNELS_DO_DOCS=OFF CMAKE_EXPORT_COMPILE_COMMANDS=OFF #Build static libraries by default +# Shared libraries are required for Sycl on Intel BUILD_SHARED_LIBRARIES=OFF KOKKOS_MAKEINSTALL_J=4 @@ -396,6 +400,7 @@ WITH_CUDA_BACKEND=OFF WITH_HIP_BACKEND=OFF KOKKOS_DEPRECATED_CODE=OFF +KOKKOS_DEPRECATED_CODE_WARNINGS=OFF while [[ $# > 0 ]] do @@ -523,6 +528,12 @@ do --release) KOKKOSKERNELS_RELEASE=ON ;; + --cmake-flags*) + PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; + --kokkos-cmake-flags*) + KOKKOS_PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; --kokkos-make-j*) echo "${key} parallel level for kokkos install" KOKKOS_MAKEINSTALL_J="${key#*=}" @@ -574,6 +585,9 @@ do --deprecated-code) KOKKOS_DEPRECATED_CODE=ON ;; + --deprecated-code-warnings) + KOKKOS_DEPRECATED_CODE_WARNINGS=ON + ;; --enable-docs) KOKKOSKERNELS_DO_DOCS=ON ;; @@ -797,9 +811,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} 
-DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF 
-DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J @@ -824,6 +838,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} 
${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} 
${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOSKERNELS_PATH} diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 98c5db89df..15bbb53711 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -178,6 +178,7 @@ fi if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues MACHINE=caraway + source /etc/profile.d/lmod.sh fi if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues From d632c0e5c967a3b7a4d5f517a0f9f97faca2c43b Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 5 Sep 2023 13:47:33 -0700 Subject: [PATCH 161/231] Address Evan's comments --- sparse/src/KokkosSparse_Utils.hpp | 174 +++++++++++------- .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 2 - 2 files changed, 106 insertions(+), 70 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 65e7e4243d..7034b50ae2 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2330,13 +2330,84 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } } +/** + * @brief Count the non-zeros of a sub-block in a CRS matrix and find the first and last column indices at each row of the sub-block + * This is a host function used by the kk_extract_diagonal_blocks_crsmatrix_sequential() + */ +template +void kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(const row_map_type &A_row_map,const entries_type &A_entries, const ordinal_type &blk_row_start, const ordinal_type &blk_col_start, 
const ordinal_type &blk_nrows, const ordinal_type &blk_ncols, size_type &blk_nnz, offset_view1d_type &first_indices, offset_view1d_type &last_indices) { + // Rowmap of i-th row-oriented sub-matrix + auto A_row_map_sub = Kokkos::subview(A_row_map, Kokkos::make_pair(blk_row_start, blk_row_start + blk_nrows + 1)); + + blk_nnz = 0; + + for (ordinal_type j = 0; j < blk_nrows; j++) { // loop through each row + size_type k1 = A_row_map_sub(j); + size_type k2 = A_row_map_sub(j + 1); + size_type k; + // Assume column indices are sorted in ascending order + // Find the position of the start column in the row + for (k = k1; k < k2; k++) { + ordinal_type col = A_entries(k); + if (col >= blk_col_start) { + break; + } + } + first_indices(j) = k; + // Find the position of the last column in the row + for (k = k2 - 1; k >= k1; k--) { + ordinal_type col = A_entries(k); + if (col < blk_col_start + blk_ncols) { + break; + } + } + last_indices(j) = k; + blk_nnz += (last_indices(j) - first_indices(j) + 1); + } +} + +/** + * @brief Extract a CRS sub-block from a CRS matrix + * This is a host function used by the kk_extract_diagonal_blocks_crsmatrix_sequential() + */ +template +void kk_extract_subblock_crsmatrix_sequential(const entries_type &A_entries, const values_type &A_values, const ordinal_type &blk_col_start, const ordinal_type &blk_nrows, const size_type &blk_nnz, const offset_view1d_type &first_indices, const offset_view1d_type &last_indices, out_row_map_type &blk_row_map, out_entries_type &blk_entries, out_values_type &blk_values) { + // - create out_row_map + // - copy A_entries to out_entries and update out_entries with local column indices + // - copy A_values to out_values + size_type first_ = 0; + for (ordinal_type j = 0; j < blk_nrows; j++) { // loop through each row + size_type nnz = last_indices(j) - first_indices(j) + 1; + blk_row_map(j) = first_; + for (size_type k = 0; k < nnz; k++) { + blk_entries(first_ + k) = A_entries(first_indices(j) + k) - blk_col_start; + 
blk_values(first_ + k) = A_values(first_indices(j) + k); + } + first_ += nnz; + } + blk_row_map(blk_nrows) = blk_nnz; // last element +} + /** * @brief Extract the diagonal blocks out of a crs matrix. * This is a blocking function that runs on the host. * - * @tparam crsMat_t The type of the CRS matrix - * @param A [in] The CrsMatrix. - * @param DiagBlk_v [out] The vector of extracted the CRS diagonal blocks. + * @tparam crsMat_t The type of the CRS matrix. + * @param A [in] The square CrsMatrix. It is expected that column indices are + * in ascending order + * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks + * (1 <= the number of diagonal blocks <= A_nrows) * * Usage Example: * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); @@ -2395,6 +2466,12 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( } } else { // A_nrows >= 1 + if ((n_blocks < 1) || (A_nrows < n_blocks)) { + std::ostringstream os; + os << "The number of diagonal blocks (" << n_blocks << ") should be >=1 and <= the number of rows of the matrix A (" << A_nrows << ")"; + throw std::runtime_error(os.str()); + } + ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0) ? 
(A_nrows / n_blocks) : (A_nrows / n_blocks + 1); @@ -2406,82 +2483,43 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( std::vector entries_h_v(n_blocks); std::vector values_h_v(n_blocks); - ordinal_type row_start = 0; // first row index of i-th diagonal block - ordinal_type col_start = 0; // first col index of i-th diagonal block - ordinal_type nrows, ncols; // Nrows, Ncols of i-th diagonal block + ordinal_type blk_row_start = 0; // first row index of i-th diagonal block + ordinal_type blk_col_start = 0; // first col index of i-th diagonal block + ordinal_type blk_nrows, blk_ncols; // Nrows, Ncols of i-th diagonal block for (ordinal_type i = 0; i < n_blocks; i++) { - nrows = rows_per_block; - if ((row_start + rows_per_block) > A_nrows) { - nrows = A_nrows - row_start; - } - col_start = row_start; - ncols = nrows; - - // Rowmap of i-th row-oriented sub-matrix - auto A_row_map_sub = Kokkos::subview( - A_row_map_h, Kokkos::make_pair(row_start, row_start + nrows + 1)); - - // First round: count i-th non-zeros or size of entries_v[i] - size_type n_entries = 0; - offset_view1d_type first("first", nrows); // first position per row - offset_view1d_type last("last", nrows); // last position per row - - for (ordinal_type j = 0; j < nrows; j++) { // loop through each row - size_type k1 = A_row_map_sub(j); - size_type k2 = A_row_map_sub(j + 1); - size_type k; - // Assume column indices are sorted in ascending order - // Find the position of the start column in the row - for (k = k1; k < k2; k++) { - ordinal_type col = A_entries_h(k); - if (col >= col_start) { - break; - } - } - first(j) = k; - // Find the position of the last column in the row - for (k = k2 - 1; k >= k1; k--) { - ordinal_type col = A_entries_h(k); - if (col < col_start + ncols) { - break; - } - } - last(j) = k; - n_entries += (last(j) - first(j) + 1); + blk_nrows = rows_per_block; + if ((blk_row_start + rows_per_block) > A_nrows) { + blk_nrows = A_nrows - blk_row_start; } + blk_col_start = 
blk_row_start; + blk_ncols = blk_nrows; - // Second round: - // - create row_map_v[i] - // - copy A_entries to entries_v[i] and update entries_v[i] with local - // column indices - // - copy A_values to values_v[i] - row_map_v[i] = out_row_map_type("row_map_v", nrows + 1); - entries_v[i] = out_entries_type("entries_v", n_entries); - values_v[i] = out_values_type("values_v", n_entries); - row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", nrows + 1); - entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", n_entries); - values_h_v[i] = out_values_hostmirror_type("values_h_v", n_entries); - size_type first_ = 0; - for (ordinal_type j = 0; j < nrows; j++) { // loop through each row - size_type nnz = last(j) - first(j) + 1; - row_map_h_v[i](j) = first_; - for (size_type k = 0; k < nnz; k++) { - entries_h_v[i](first_ + k) = A_entries_h(first(j) + k) - col_start; - values_h_v[i](first_ + k) = A_values_h(first(j) + k); - } - first_ += nnz; - } - row_map_h_v[i](nrows) = n_entries; // last element + // First round: count i-th non-zeros or size of entries_v[i] and find the first and last column indices at each row + size_type blk_nnz = 0; + offset_view1d_type first("first", blk_nrows); // first position per row + offset_view1d_type last("last", blk_nrows); // last position per row + + kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); + + // Second round: extract + row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); + entries_v[i] = out_entries_type("entries_v", blk_nnz); + values_v[i] = out_values_type("values_v", blk_nnz); + row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); + entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz); + values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); + + kk_extract_subblock_crsmatrix_sequential(A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, 
first, last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); Kokkos::deep_copy(entries_v[i], entries_h_v[i]); Kokkos::deep_copy(values_v[i], values_h_v[i]); - DiagBlk_v[i] = crsMat_t("CrsMatrix", nrows, ncols, n_entries, + DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz, values_v[i], row_map_v[i], entries_v[i]); - row_start += nrows; + blk_row_start += blk_nrows; } // for (ordinal_type i = 0; i < n_blocks; i++) } // A_nrows >= 1 } // n_blocks > 1 diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index 69d8eabb0a..327780dec3 100644 --- a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -135,8 +135,6 @@ void test_extract_diagonal_blocks() { for (int s = 1; s <= 8; s++) { Test::run_test_extract_diagonal_blocks( 0, s); - Test::run_test_extract_diagonal_blocks( - 3, s); Test::run_test_extract_diagonal_blocks( 12, s); Test::run_test_extract_diagonal_blocks( From ac523de1e9c90c2c20ef123c64a168a26e57c77b Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 5 Sep 2023 14:00:36 -0700 Subject: [PATCH 162/231] Clang format --- sparse/src/KokkosSparse_Utils.hpp | 82 ++++++++++++++++++------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 7034b50ae2..33d9d6806a 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -2331,21 +2331,26 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } /** - * @brief Count the non-zeros of a sub-block in a CRS matrix and find the first and last column indices at each row of the sub-block - * This is a host function used by the kk_extract_diagonal_blocks_crsmatrix_sequential() + * @brief Count the non-zeros of a sub-block in a CRS matrix and find the first + * and last column indices 
at each row of the sub-block. This is a host function + * used by the kk_extract_diagonal_blocks_crsmatrix_sequential() */ -template -void kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(const row_map_type &A_row_map,const entries_type &A_entries, const ordinal_type &blk_row_start, const ordinal_type &blk_col_start, const ordinal_type &blk_nrows, const ordinal_type &blk_ncols, size_type &blk_nnz, offset_view1d_type &first_indices, offset_view1d_type &last_indices) { +template +void kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( + const row_map_type &A_row_map, const entries_type &A_entries, + const ordinal_type &blk_row_start, const ordinal_type &blk_col_start, + const ordinal_type &blk_nrows, const ordinal_type &blk_ncols, + size_type &blk_nnz, offset_view1d_type &first_indices, + offset_view1d_type &last_indices) { // Rowmap of i-th row-oriented sub-matrix - auto A_row_map_sub = Kokkos::subview(A_row_map, Kokkos::make_pair(blk_row_start, blk_row_start + blk_nrows + 1)); + auto A_row_map_sub = Kokkos::subview( + A_row_map, + Kokkos::make_pair(blk_row_start, blk_row_start + blk_nrows + 1)); blk_nnz = 0; - for (ordinal_type j = 0; j < blk_nrows; j++) { // loop through each row + for (ordinal_type j = 0; j < blk_nrows; j++) { // loop through each row size_type k1 = A_row_map_sub(j); size_type k2 = A_row_map_sub(j + 1); size_type k; @@ -2372,19 +2377,22 @@ void kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(const row_map_ /** * @brief Extract a CRS sub-block from a CRS matrix - * This is a host function used by the kk_extract_diagonal_blocks_crsmatrix_sequential() + * This is a host function used by the + * kk_extract_diagonal_blocks_crsmatrix_sequential() */ -template -void kk_extract_subblock_crsmatrix_sequential(const entries_type &A_entries, const values_type &A_values, const ordinal_type &blk_col_start, const ordinal_type &blk_nrows, const size_type &blk_nnz, const offset_view1d_type &first_indices, const 
offset_view1d_type &last_indices, out_row_map_type &blk_row_map, out_entries_type &blk_entries, out_values_type &blk_values) { +void kk_extract_subblock_crsmatrix_sequential( + const entries_type &A_entries, const values_type &A_values, + const ordinal_type &blk_col_start, const ordinal_type &blk_nrows, + const size_type &blk_nnz, const offset_view1d_type &first_indices, + const offset_view1d_type &last_indices, out_row_map_type &blk_row_map, + out_entries_type &blk_entries, out_values_type &blk_values) { // - create out_row_map - // - copy A_entries to out_entries and update out_entries with local column indices + // - copy A_entries to out_entries and update out_entries with local column + // indices // - copy A_values to out_values size_type first_ = 0; for (ordinal_type j = 0; j < blk_nrows; j++) { // loop through each row @@ -2468,7 +2476,9 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( // A_nrows >= 1 if ((n_blocks < 1) || (A_nrows < n_blocks)) { std::ostringstream os; - os << "The number of diagonal blocks (" << n_blocks << ") should be >=1 and <= the number of rows of the matrix A (" << A_nrows << ")"; + os << "The number of diagonal blocks (" << n_blocks + << ") should be >=1 and <= the number of rows of the matrix A (" + << A_nrows << ")"; throw std::runtime_error(os.str()); } @@ -2483,9 +2493,9 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( std::vector entries_h_v(n_blocks); std::vector values_h_v(n_blocks); - ordinal_type blk_row_start = 0; // first row index of i-th diagonal block - ordinal_type blk_col_start = 0; // first col index of i-th diagonal block - ordinal_type blk_nrows, blk_ncols; // Nrows, Ncols of i-th diagonal block + ordinal_type blk_row_start = 0; // first row index of i-th diagonal block + ordinal_type blk_col_start = 0; // first col index of i-th diagonal block + ordinal_type blk_nrows, blk_ncols; // Nrows, Ncols of i-th diagonal block for (ordinal_type i = 0; i < n_blocks; i++) { blk_nrows = rows_per_block; @@ 
-2495,22 +2505,28 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( blk_col_start = blk_row_start; blk_ncols = blk_nrows; - // First round: count i-th non-zeros or size of entries_v[i] and find the first and last column indices at each row + // First round: count i-th non-zeros or size of entries_v[i] and find + // the first and last column indices at each row size_type blk_nnz = 0; - offset_view1d_type first("first", blk_nrows); // first position per row - offset_view1d_type last("last", blk_nrows); // last position per row + offset_view1d_type first("first", blk_nrows); // first position per row + offset_view1d_type last("last", blk_nrows); // last position per row - kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); + kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( + A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, + blk_ncols, blk_nnz, first, last); // Second round: extract - row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); - entries_v[i] = out_entries_type("entries_v", blk_nnz); - values_v[i] = out_values_type("values_v", blk_nnz); - row_map_h_v[i] = out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); + row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); + entries_v[i] = out_entries_type("entries_v", blk_nnz); + values_v[i] = out_values_type("values_v", blk_nnz); + row_map_h_v[i] = + out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz); values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); - kk_extract_subblock_crsmatrix_sequential(A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); + kk_extract_subblock_crsmatrix_sequential( + A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, + last, row_map_h_v[i], 
entries_h_v[i], values_h_v[i]); Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); Kokkos::deep_copy(entries_v[i], entries_h_v[i]); From aa70eb9526988a1da2a70debbab39360ed5d7419 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 28 Aug 2023 12:18:53 -0600 Subject: [PATCH 163/231] SPMV_Struct_Functor: initialize numExterior --- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index dc3c592632..c18018f54f 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -142,6 +142,7 @@ struct SPMV_Struct_Functor { beta(beta_), m_y(m_y_), stencil_type(stencil_type_), + numExterior(0), rows_per_team(rows_per_team_), rows_per_team_ext(rows_per_team_ext_) { static_assert(static_cast(XVector::rank) == 1, From f2909851f27dd87b0476e5231c57d96893881d76 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 09:15:53 -0600 Subject: [PATCH 164/231] perf_test/sparse: Update GS perf_test for streams --- perf_test/sparse/KokkosSparse_gs.cpp | 235 ++++++++++++++++++--------- 1 file changed, 159 insertions(+), 76 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 2a8b164219..52fe9182ab 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -53,6 +53,7 @@ struct GS_Parameters { int maxNnzPerLongRow = 2000; bool graph_symmetric = false; int sweeps = 1; + int nstreams = 1; GSAlgorithm algo = GS_DEFAULT; GSDirection direction = GS_FORWARD; // Point: @@ -187,92 +188,170 @@ void runGS(const GS_Parameters& params) { Kokkos::finalize(); exit(1); } + std::vector instances; // size_type nnz = A.nnz(); - KernelHandle kh; + std::vector kh(params.nstreams); // use a random RHS - uniformly distributed over (-5, 5) - scalar_view_t b("b", nrows); - { - srand(54321); - auto bhost = 
Kokkos::create_mirror_view(b); - for (lno_t i = 0; i < nrows; i++) { - bhost(i) = 10.0 * rand() / RAND_MAX - 5.0; - } - Kokkos::deep_copy(b, bhost); - } - double bnorm = KokkosBlas::nrm2(b); + std::vector b(params.nstreams); // initial LHS is 0 - scalar_view_t x("x", nrows); + std::vector x(params.nstreams); + // Extract diagonal blocks of CRS matrix + std::vector DiagBlks(params.nstreams); // how long symbolic/numeric phases take (the graph reuse case isn't that // interesting since numeric doesn't do much) Kokkos::Timer timer; - // cluster size of 1 is standard multicolor GS - if (params.algo == GS_DEFAULT) { - kh.create_gs_handle(); - kh.get_point_gs_handle()->set_long_row_threshold(params.longRowThreshold); - } else if (params.algo == GS_CLUSTER) { - kh.create_gs_handle(params.coarse_algo, params.cluster_size); - } else { - kh.create_gs_handle(params.algo); - if (params.algo == GS_TWOSTAGE) kh.set_gs_twostage(!params.classic, nrows); - } - timer.reset(); - KokkosSparse::Experimental::gauss_seidel_symbolic( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, - params.graph_symmetric); - double symbolicLaunchTime = timer.seconds(); - timer.reset(); - Kokkos::fence(); - double symbolicComputeTime = timer.seconds(); - timer.reset(); - KokkosSparse::Experimental::gauss_seidel_numeric( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, - params.graph_symmetric); - double numericLaunchTime = timer.seconds(); - timer.reset(); - Kokkos::fence(); - double numericComputeTime = timer.seconds(); - timer.reset(); - // Last two parameters are damping factor (should be 1) and sweeps - switch (params.direction) { - case GS_SYMMETRIC: - KokkosSparse::Experimental::symmetric_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - true, true, 1.0, params.sweeps); - break; - case GS_FORWARD: - KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - 
true, true, 1.0, params.sweeps); - break; - case GS_BACKWARD: - KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - true, true, 1.0, params.sweeps); - break; + + { + namespace KE = Kokkos::Experimental; + auto ns = params.nstreams; + auto es = exec_space(); + if (ns == 1) + instances = KE::partition_space(es, 1); + else if (ns == 2) + instances = KE::partition_space(es, 1, 1); + else if (ns == 3) + instances = KE::partition_space(es, 1, 1, 1); + else if (ns == 4) + instances = KE::partition_space(es, 1, 1, 1, 1); + else if (ns == 5) + instances = KE::partition_space(es, 1, 1, 1, 1, 1); + else if (ns == 6) + instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1); + else if (ns == 7) + instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1, 1); + else if (ns == 8) + instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1, 1, 1); + else + Kokkos::abort("--streams outside of [1, 8] is not supported."); } - double applyLaunchTime = timer.seconds(); + double blockExtractionTime = 0, symbolicLaunchTimeTotal = 0, + symbolicComputeTimeTotal = 0, numericLaunchTimeTotal = 0, + numericComputeTimeTotal = 0, applyLaunchTimeTotal = 0, + applyComputeTimeTotal = 0; + timer.reset(); + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, + DiagBlks); Kokkos::fence(); - double applyComputeTime = timer.seconds(); - timer.reset(); - kh.destroy_gs_handle(); - // Now, compute the 2-norm of residual - scalar_view_t res("Ax-b", nrows); - Kokkos::deep_copy(res, b); - scalar_t alpha = Kokkos::reduction_identity::prod(); - scalar_t beta = -alpha; - KokkosSparse::spmv("N", alpha, A, x, beta, res); - double resnorm = KokkosBlas::nrm2(res); - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; - std::cout << "\n*** Numeric 
compute time: " << numericComputeTime << '\n'; - std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; - std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; - // note: this still works if the solution diverges - std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; + blockExtractionTime = timer.seconds(); + + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + auto blk_ncols = blk_A.numCols(); + if (blk_nrows != blk_ncols) { + cout << "ERROR: Gauss-Seidel only works for square matrices\n"; + Kokkos::finalize(); + exit(1); + } + b[i] = scalar_view_t("b[" + std::to_string(i) + "]", blk_nrows); + x[i] = scalar_view_t("x[" + std::to_string(i) + "]", blk_nrows); + { + srand(54321); + auto bhost = Kokkos::create_mirror_view(b[i]); + for (lno_t row_id = 0; row_id < blk_nrows; row_id++) { + bhost(row_id) = 10.0 * rand() / RAND_MAX - 5.0; + } + Kokkos::deep_copy(instances[i], b[i], bhost); + } + double bnorm = KokkosBlas::nrm2(instances[i], b[i]); + // cluster size of 1 is standard multicolor GS + if (params.algo == GS_DEFAULT) { + kh[i].create_gs_handle(instances[i], params.nstreams); + kh[i].get_point_gs_handle()->set_long_row_threshold( + params.longRowThreshold); + } else if (params.algo == GS_CLUSTER) { + kh[i].create_gs_handle(params.coarse_algo, params.cluster_size); + } else { + kh[i].create_gs_handle(params.algo); + if (params.algo == GS_TWOSTAGE) + kh[i].set_gs_twostage(!params.classic, blk_nrows); + } + timer.reset(); + KokkosSparse::Experimental::gauss_seidel_symbolic( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, params.graph_symmetric); + double symbolicLaunchTime = timer.seconds(); + timer.reset(); + Kokkos::fence(); + double symbolicComputeTime = timer.seconds(); + timer.reset(); + KokkosSparse::Experimental::gauss_seidel_numeric( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + 
blk_A.graph.entries, blk_A.values, params.graph_symmetric); + double numericLaunchTime = timer.seconds(); + timer.reset(); + Kokkos::fence(); + double numericComputeTime = timer.seconds(); + timer.reset(); + // Last two parameters are damping factor (should be 1) and sweeps + switch (params.direction) { + case GS_SYMMETRIC: + KokkosSparse::Experimental::symmetric_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + case GS_FORWARD: + KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + case GS_BACKWARD: + KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + } + + double applyLaunchTime = timer.seconds(); + timer.reset(); + Kokkos::fence(); + double applyComputeTime = timer.seconds(); + timer.reset(); + kh[i].destroy_gs_handle(); + // Now, compute the 2-norm of residual + scalar_view_t res("Ax-b", blk_nrows); + Kokkos::deep_copy(instances[i], res, b[i]); + scalar_t alpha = Kokkos::reduction_identity::prod(); + scalar_t beta = -alpha; + KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], + beta, res); + double resnorm = KokkosBlas::nrm2(instances[i], res); + std::cout << "\n***Stream ID: " << i << std::endl; + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + std::cout << 
"\n*** Apply compute time: " << applyComputeTime << '\n'; + // note: this still works if the solution diverges + std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; + symbolicLaunchTimeTotal += symbolicLaunchTime; + symbolicComputeTimeTotal += symbolicComputeTime; + numericLaunchTimeTotal += numericLaunchTime; + numericComputeTimeTotal += numericComputeTime; + applyLaunchTimeTotal += applyLaunchTime; + applyComputeTimeTotal += applyComputeTime; + } + std::cout << "\n\n\n*** Total block extraction time: " << blockExtractionTime + << '\n'; + std::cout << "\n*** Total Symbolic launch time: " << symbolicLaunchTimeTotal + << '\n'; + std::cout << "\n*** Total Symbolic compute time: " << symbolicComputeTimeTotal + << '\n'; + std::cout << "\n*** Total Numeric launch time: " << numericLaunchTimeTotal + << '\n'; + std::cout << "\n*** Total Numeric compute time: " << numericComputeTimeTotal + << '\n'; + std::cout << "\n*** Total Apply launch time: " << applyLaunchTimeTotal + << '\n'; + std::cout << "\n*** Total Apply compute time: " << applyComputeTimeTotal + << '\n'; } int main(int argc, char** argv) { @@ -288,6 +367,8 @@ int main(int argc, char** argv) { "symmetric.\n"; cout << " : if generating matrix randomly, it is symmetrized\n"; cout << "--sweeps S: run S times (default 1)\n"; + cout << "--streams N: partition matrix and run across N streams (default " + "1)\n"; cout << "Randomized matrix settings, if not reading from file:\n"; cout << " --n : number of rows/columns\n"; cout << " --nnz : number of nonzeros in each regular row\n"; @@ -354,6 +435,8 @@ int main(int argc, char** argv) { params.direction = GS_BACKWARD; else if (!strcmp(argv[i], "--sweeps")) params.sweeps = atoi(getNextArg(i, argc, argv)); + else if (!strcmp(argv[i], "--streams")) + params.nstreams = atoi(getNextArg(i, argc, argv)); else if (!strcmp(argv[i], "--point")) params.algo = GS_DEFAULT; else if (!strcmp(argv[i], "--cluster")) From 17cc1b4ce565693cf946f8bc17e99534281a2ac2 Mon Sep 17 
00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 10:24:55 -0600 Subject: [PATCH 165/231] perf_test/sparse: Refactored for timing non-blocking apply without fences --- perf_test/sparse/KokkosSparse_gs.cpp | 48 ++++++++++++++++------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 52fe9182ab..c3101c7eee 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -255,7 +255,6 @@ void runGS(const GS_Parameters& params) { } Kokkos::deep_copy(instances[i], b[i], bhost); } - double bnorm = KokkosBlas::nrm2(instances[i], b[i]); // cluster size of 1 is standard multicolor GS if (params.algo == GS_DEFAULT) { kh[i].create_gs_handle(instances[i], params.nstreams); @@ -284,7 +283,21 @@ void runGS(const GS_Parameters& params) { timer.reset(); Kokkos::fence(); double numericComputeTime = timer.seconds(); - timer.reset(); + std::cout << "\n***Stream ID: " << i << std::endl; + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; + symbolicLaunchTimeTotal += symbolicLaunchTime; + symbolicComputeTimeTotal += symbolicComputeTime; + numericLaunchTimeTotal += numericLaunchTime; + numericComputeTimeTotal += numericComputeTime; + } + + timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { case GS_SYMMETRIC: @@ -306,37 +319,30 @@ void runGS(const GS_Parameters& params) { params.sweeps); break; } + } + applyLaunchTimeTotal = timer.seconds(); + timer.reset(); + Kokkos::fence(); + applyComputeTimeTotal = timer.seconds(); + timer.reset(); - 
double applyLaunchTime = timer.seconds(); - timer.reset(); - Kokkos::fence(); - double applyComputeTime = timer.seconds(); - timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); kh[i].destroy_gs_handle(); // Now, compute the 2-norm of residual scalar_view_t res("Ax-b", blk_nrows); Kokkos::deep_copy(instances[i], res, b[i]); + double bnorm = KokkosBlas::nrm2(instances[i], b[i]); scalar_t alpha = Kokkos::reduction_identity::prod(); scalar_t beta = -alpha; KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], beta, res); double resnorm = KokkosBlas::nrm2(instances[i], res); - std::cout << "\n***Stream ID: " << i << std::endl; - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; - std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; - std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; - std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; // note: this still works if the solution diverges - std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; - symbolicLaunchTimeTotal += symbolicLaunchTime; - symbolicComputeTimeTotal += symbolicComputeTime; - numericLaunchTimeTotal += numericLaunchTime; - numericComputeTimeTotal += numericComputeTime; - applyLaunchTimeTotal += applyLaunchTime; - applyComputeTimeTotal += applyComputeTime; + std::cout << "StreamID(" << i << "): Relative res norm: " << resnorm / bnorm + << '\n'; } std::cout << "\n\n\n*** Total block extraction time: " << blockExtractionTime << '\n'; From 552eb9368ca324233c5d393493160affa9ea7bc1 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Wed, 6 Sep 2023 10:33:01 -0600 Subject: [PATCH 166/231] Update perf_test/sparse/KokkosSparse_gs.cpp Co-authored-by: 
brian-kelley --- perf_test/sparse/KokkosSparse_gs.cpp | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c3101c7eee..4397dd8c41 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -205,24 +205,9 @@ void runGS(const GS_Parameters& params) { namespace KE = Kokkos::Experimental; auto ns = params.nstreams; auto es = exec_space(); - if (ns == 1) - instances = KE::partition_space(es, 1); - else if (ns == 2) - instances = KE::partition_space(es, 1, 1); - else if (ns == 3) - instances = KE::partition_space(es, 1, 1, 1); - else if (ns == 4) - instances = KE::partition_space(es, 1, 1, 1, 1); - else if (ns == 5) - instances = KE::partition_space(es, 1, 1, 1, 1, 1); - else if (ns == 6) - instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1); - else if (ns == 7) - instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1, 1); - else if (ns == 8) - instances = KE::partition_space(es, 1, 1, 1, 1, 1, 1, 1, 1); - else - Kokkos::abort("--streams outside of [1, 8] is not supported."); + std::vector weights(ns); + std::fill(weights.begin(), weights.end(), 1); + instances = KE::partition_space(es, weights); } double blockExtractionTime = 0, symbolicLaunchTimeTotal = 0, From a8fd7a49d03ec549eaa22f4a869b4110e03e4280 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Wed, 6 Sep 2023 10:33:08 -0600 Subject: [PATCH 167/231] Update perf_test/sparse/KokkosSparse_gs.cpp Co-authored-by: brian-kelley --- perf_test/sparse/KokkosSparse_gs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 4397dd8c41..857f092561 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -233,7 +233,7 @@ void runGS(const GS_Parameters& params) { b[i] = scalar_view_t("b[" + 
std::to_string(i) + "]", blk_nrows); x[i] = scalar_view_t("x[" + std::to_string(i) + "]", blk_nrows); { - srand(54321); + srand(54321 + i); auto bhost = Kokkos::create_mirror_view(b[i]); for (lno_t row_id = 0; row_id < blk_nrows; row_id++) { bhost(row_id) = 10.0 * rand() / RAND_MAX - 5.0; From 7548ec2b5a66efadddd941aaeb50f76c424c8238 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 10:41:00 -0600 Subject: [PATCH 168/231] perf_test/sparse: Re-work numeric and symbolic timing --- perf_test/sparse/KokkosSparse_gs.cpp | 45 +++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 857f092561..9dc1b0b150 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -221,6 +221,7 @@ void runGS(const GS_Parameters& params) { Kokkos::fence(); blockExtractionTime = timer.seconds(); + /////////////////// Handle creation /////////////////// for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; auto blk_nrows = blk_A.numRows(); @@ -252,33 +253,37 @@ void runGS(const GS_Parameters& params) { if (params.algo == GS_TWOSTAGE) kh[i].set_gs_twostage(!params.classic, blk_nrows); } - timer.reset(); + } + + /////////////////// Symbolic ///////////////// + timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); KokkosSparse::Experimental::gauss_seidel_symbolic( instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, blk_A.graph.entries, params.graph_symmetric); - double symbolicLaunchTime = timer.seconds(); - timer.reset(); - Kokkos::fence(); - double symbolicComputeTime = timer.seconds(); - timer.reset(); + } + symbolicLaunchTimeTotal = timer.seconds(); + timer.reset(); + Kokkos::fence(); + symbolicComputeTimeTotal = timer.seconds(); + + /////////////////// Numeric ///////////////// + timer.reset(); + for (int i = 0; i < 
params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); KokkosSparse::Experimental::gauss_seidel_numeric( instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, blk_A.graph.entries, blk_A.values, params.graph_symmetric); - double numericLaunchTime = timer.seconds(); - timer.reset(); - Kokkos::fence(); - double numericComputeTime = timer.seconds(); - std::cout << "\n***Stream ID: " << i << std::endl; - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; - std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; - symbolicLaunchTimeTotal += symbolicLaunchTime; - symbolicComputeTimeTotal += symbolicComputeTime; - numericLaunchTimeTotal += numericLaunchTime; - numericComputeTimeTotal += numericComputeTime; } + numericLaunchTimeTotal = timer.seconds(); + timer.reset(); + Kokkos::fence(); + numericComputeTimeTotal = timer.seconds(); + /////////////////// Apply ///////////////// timer.reset(); for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; @@ -329,7 +334,7 @@ void runGS(const GS_Parameters& params) { std::cout << "StreamID(" << i << "): Relative res norm: " << resnorm / bnorm << '\n'; } - std::cout << "\n\n\n*** Total block extraction time: " << blockExtractionTime + std::cout << "\n*** Total block extraction time: " << blockExtractionTime << '\n'; std::cout << "\n*** Total Symbolic launch time: " << symbolicLaunchTimeTotal << '\n'; From 35d8ff92b320c27fb36ce2875187db6d3ef49c92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 10:58:22 -0600 Subject: [PATCH 169/231] perf_test/sparse: Improve time reporting --- perf_test/sparse/KokkosSparse_gs.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp 
index 9dc1b0b150..163fdb2dd1 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -338,16 +338,24 @@ void runGS(const GS_Parameters& params) { << '\n'; std::cout << "\n*** Total Symbolic launch time: " << symbolicLaunchTimeTotal << '\n'; - std::cout << "\n*** Total Symbolic compute time: " << symbolicComputeTimeTotal + std::cout << "*** Total Symbolic compute time: " << symbolicComputeTimeTotal << '\n'; std::cout << "\n*** Total Numeric launch time: " << numericLaunchTimeTotal << '\n'; - std::cout << "\n*** Total Numeric compute time: " << numericComputeTimeTotal + std::cout << "*** Total Numeric compute time: " << numericComputeTimeTotal << '\n'; std::cout << "\n*** Total Apply launch time: " << applyLaunchTimeTotal << '\n'; - std::cout << "\n*** Total Apply compute time: " << applyComputeTimeTotal + std::cout << "*** Total Apply compute time: " << applyComputeTimeTotal << '\n'; + double launchTimeTotal = + symbolicLaunchTimeTotal + numericLaunchTimeTotal + applyLaunchTimeTotal; + std::cout << "\n*** Total launch time: " << launchTimeTotal << '\n'; + double computeTimeTotal = symbolicComputeTimeTotal + numericComputeTimeTotal + + applyComputeTimeTotal; + std::cout << "*** Total compute time: " << computeTimeTotal << '\n'; + std::cout << "\n*** Total compute and launch time: " + << launchTimeTotal + computeTimeTotal << '\n'; } int main(int argc, char** argv) { From be40ee6fcdb04a92e22fac116c6d398f80cb2dee Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 14:59:05 -0600 Subject: [PATCH 170/231] common/src: Remove fence from zero_vector --- common/src/KokkosKernels_Utils.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index c6780185a4..f6861d8904 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -899,7 +899,6 @@ void zero_vector(ExecSpaceIn &exec_space_in, typedef typename 
value_array_type::non_const_value_type val_type; Kokkos::deep_copy(exec_space_in, vector, Kokkos::ArithTraits::zero()); - exec_space_in.fence(); } template From 409c691fb88f20bed3d5fa08727eb2cfef8bb6ee Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 6 Sep 2023 15:10:58 -0600 Subject: [PATCH 171/231] Use std::thread for apply --- perf_test/sparse/KokkosSparse_gs.cpp | 58 +++++++++++++++------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 163fdb2dd1..a8632691f0 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -30,6 +30,7 @@ #include #include #include +#include using std::cout; using std::string; @@ -175,6 +176,7 @@ void runGS(const GS_Parameters& params) { crsMat_t; // typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + namespace KSE = KokkosSparse::Experimental; crsMat_t A; if (params.matrix_path) A = KokkosSparse::Impl::read_kokkos_crst_matrix( @@ -260,9 +262,9 @@ void runGS(const GS_Parameters& params) { for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; auto blk_nrows = blk_A.numRows(); - KokkosSparse::Experimental::gauss_seidel_symbolic( - instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, - blk_A.graph.entries, params.graph_symmetric); + KSE::gauss_seidel_symbolic(instances[i], &kh[i], blk_nrows, blk_nrows, + blk_A.graph.row_map, blk_A.graph.entries, + params.graph_symmetric); } symbolicLaunchTimeTotal = timer.seconds(); timer.reset(); @@ -274,42 +276,46 @@ void runGS(const GS_Parameters& params) { for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; auto blk_nrows = blk_A.numRows(); - KokkosSparse::Experimental::gauss_seidel_numeric( - instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, - blk_A.graph.entries, blk_A.values, params.graph_symmetric); + 
KSE::gauss_seidel_numeric(instances[i], &kh[i], blk_nrows, blk_nrows, + blk_A.graph.row_map, blk_A.graph.entries, + blk_A.values, params.graph_symmetric); } numericLaunchTimeTotal = timer.seconds(); timer.reset(); Kokkos::fence(); numericComputeTimeTotal = timer.seconds(); - /////////////////// Apply ///////////////// - timer.reset(); - for (int i = 0; i < params.nstreams; i++) { - auto blk_A = DiagBlks[i]; - auto blk_nrows = blk_A.numRows(); + /////////////////// Apply /////////////// + // NOTE: You cannot use capture by value in the 'apply' lambda since 'kh' has + // no copy constructor. + auto apply = [&](const int i) { // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { case GS_SYMMETRIC: - KokkosSparse::Experimental::symmetric_gauss_seidel_apply( - instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, - blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, - params.sweeps); + KSE::symmetric_gauss_seidel_apply( + instances[i], &kh[i], DiagBlks[i].numRows(), DiagBlks[i].numRows(), + DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, + DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); break; case GS_FORWARD: - KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, - blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, - params.sweeps); + KSE::forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], DiagBlks[i].numRows(), DiagBlks[i].numRows(), + DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, + DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); break; case GS_BACKWARD: - KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( - instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, - blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, - params.sweeps); + KSE::backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], DiagBlks[i].numRows(), 
DiagBlks[i].numRows(), + DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, + DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); break; } - } + }; + std::vector apply_thread(params.nstreams); + timer.reset(); + for (int i = 0; i < params.nstreams; i++) + apply_thread[i] = std::thread(apply, i); + for (int i = 0; i < params.nstreams; i++) apply_thread[i].join(); applyLaunchTimeTotal = timer.seconds(); timer.reset(); Kokkos::fence(); @@ -326,9 +332,7 @@ void runGS(const GS_Parameters& params) { double bnorm = KokkosBlas::nrm2(instances[i], b[i]); scalar_t alpha = Kokkos::reduction_identity::prod(); scalar_t beta = -alpha; - KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], - beta, res); + KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], beta, res); double resnorm = KokkosBlas::nrm2(instances[i], res); // note: this still works if the solution diverges std::cout << "StreamID(" << i << "): Relative res norm: " << resnorm / bnorm From 84ccefb7c8149e6c92049865cf9d99bafb114e56 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 7 Sep 2023 08:11:21 -0600 Subject: [PATCH 172/231] Revert "common/src: Remove fence from zero_vector" This reverts commit be40ee6fcdb04a92e22fac116c6d398f80cb2dee. 
--- common/src/KokkosKernels_Utils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index f6861d8904..c6780185a4 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -899,6 +899,7 @@ void zero_vector(ExecSpaceIn &exec_space_in, typedef typename value_array_type::non_const_value_type val_type; Kokkos::deep_copy(exec_space_in, vector, Kokkos::ArithTraits::zero()); + exec_space_in.fence(); } template From 387b17cc266b712661752362ae795ed1612ef1ae Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 7 Sep 2023 08:39:52 -0600 Subject: [PATCH 173/231] Revert "Use std::thread for apply" This reverts commit 409c691fb88f20bed3d5fa08727eb2cfef8bb6ee. --- perf_test/sparse/KokkosSparse_gs.cpp | 58 +++++++++++++--------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index a8632691f0..163fdb2dd1 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -30,7 +30,6 @@ #include #include #include -#include using std::cout; using std::string; @@ -176,7 +175,6 @@ void runGS(const GS_Parameters& params) { crsMat_t; // typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - namespace KSE = KokkosSparse::Experimental; crsMat_t A; if (params.matrix_path) A = KokkosSparse::Impl::read_kokkos_crst_matrix( @@ -262,9 +260,9 @@ void runGS(const GS_Parameters& params) { for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; auto blk_nrows = blk_A.numRows(); - KSE::gauss_seidel_symbolic(instances[i], &kh[i], blk_nrows, blk_nrows, - blk_A.graph.row_map, blk_A.graph.entries, - params.graph_symmetric); + KokkosSparse::Experimental::gauss_seidel_symbolic( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, 
params.graph_symmetric); } symbolicLaunchTimeTotal = timer.seconds(); timer.reset(); @@ -276,46 +274,42 @@ void runGS(const GS_Parameters& params) { for (int i = 0; i < params.nstreams; i++) { auto blk_A = DiagBlks[i]; auto blk_nrows = blk_A.numRows(); - KSE::gauss_seidel_numeric(instances[i], &kh[i], blk_nrows, blk_nrows, - blk_A.graph.row_map, blk_A.graph.entries, - blk_A.values, params.graph_symmetric); + KokkosSparse::Experimental::gauss_seidel_numeric( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, params.graph_symmetric); } numericLaunchTimeTotal = timer.seconds(); timer.reset(); Kokkos::fence(); numericComputeTimeTotal = timer.seconds(); - /////////////////// Apply /////////////// - // NOTE: You cannot use capture by value in the 'apply' lambda since 'kh' has - // no copy constructor. - auto apply = [&](const int i) { + /////////////////// Apply ///////////////// + timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { case GS_SYMMETRIC: - KSE::symmetric_gauss_seidel_apply( - instances[i], &kh[i], DiagBlks[i].numRows(), DiagBlks[i].numRows(), - DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, - DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); + KokkosSparse::Experimental::symmetric_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); break; case GS_FORWARD: - KSE::forward_sweep_gauss_seidel_apply( - instances[i], &kh[i], DiagBlks[i].numRows(), DiagBlks[i].numRows(), - DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, - DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); + KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, 
blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); break; case GS_BACKWARD: - KSE::backward_sweep_gauss_seidel_apply( - instances[i], &kh[i], DiagBlks[i].numRows(), DiagBlks[i].numRows(), - DiagBlks[i].graph.row_map, DiagBlks[i].graph.entries, - DiagBlks[i].values, x[i], b[i], true, true, 1.0, params.sweeps); + KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); break; } - }; - std::vector apply_thread(params.nstreams); - timer.reset(); - for (int i = 0; i < params.nstreams; i++) - apply_thread[i] = std::thread(apply, i); - for (int i = 0; i < params.nstreams; i++) apply_thread[i].join(); + } applyLaunchTimeTotal = timer.seconds(); timer.reset(); Kokkos::fence(); @@ -332,7 +326,9 @@ void runGS(const GS_Parameters& params) { double bnorm = KokkosBlas::nrm2(instances[i], b[i]); scalar_t alpha = Kokkos::reduction_identity::prod(); scalar_t beta = -alpha; - KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], beta, res); + KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], + beta, res); double resnorm = KokkosBlas::nrm2(instances[i], res); // note: this still works if the solution diverges std::cout << "StreamID(" << i << "): Relative res norm: " << resnorm / bnorm From 5cce74d92a1426963f32514c7f6d0697ccfbd883 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 7 Sep 2023 16:57:53 -0600 Subject: [PATCH 174/231] Fix sort_and_merge functions for in-place case sort_and_merge_graph, sort_and_merge_matrix produced incorrect output if any input view (rowptrs, entries, values) was the same object as the corresponding output view. Fix this and add testing that catches the bug. 
--- sparse/src/KokkosSparse_SortCrs.hpp | 15 ++- sparse/unit_test/Test_Sparse_SortCrs.hpp | 134 +++++++++++++++++------ 2 files changed, 114 insertions(+), 35 deletions(-) diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 31b835d358..107923797a 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -627,6 +627,12 @@ void sort_and_merge_matrix(const exec_space& exec, values_out = values_in; return; } + // Have to do the compression. Create a _shallow_ copy of the input + // to preserve it, in case the input and output views are identical + // references. + auto rowmap_orig = rowmap_in; + auto entries_orig = entries_in; + auto values_orig = values_in; // Prefix sum to get rowmap KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( @@ -642,7 +648,7 @@ void sort_and_merge_matrix(const exec_space& exec, Kokkos::parallel_for( range_t(exec, 0, numRows), Impl::MatrixMergedEntriesFunctor( - rowmap_in, entries_in, values_in, rowmap_out, entries_out, + rowmap_orig, entries_orig, values_orig, rowmap_out, entries_out, values_out)); } @@ -746,6 +752,11 @@ void sort_and_merge_graph(const exec_space& exec, entries_out = entries_in; return; } + // Have to do the compression. Create a _shallow_ copy of the input + // to preserve it, in case the input and output views are identical + // references. + auto rowmap_orig = rowmap_in; + auto entries_orig = entries_in; // Prefix sum to get rowmap. // In the case where the output rowmap is the same as the input, we could just // assign "rowmap_out = rowmap_in" except that would break const-correctness. 
@@ -760,7 +771,7 @@ void sort_and_merge_graph(const exec_space& exec, // Compute merged entries and values Kokkos::parallel_for(range_t(exec, 0, numRows), Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); + rowmap_orig, entries_orig, rowmap_out, entries_out)); } template diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 63c977ca9a..6cf989accf 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -209,7 +209,7 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { template void testSortAndMerge(bool justGraph, int howExecSpecified, - bool doStructInterface, int testCase) { + bool doStructInterface, bool inPlace, int testCase) { using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; @@ -361,21 +361,49 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, } else { rowmap_t devOutRowmap; entries_t devOutEntries; + if (inPlace) { + // Start out with the output views containing the input, so that + // sort/merge is done in-place + devOutRowmap = rowmap_t("devOutRowmap", input.graph.row_map.extent(0)); + devOutEntries = + entries_t("devOutEntries", input.graph.entries.extent(0)); + Kokkos::deep_copy(devOutRowmap, input.graph.row_map); + Kokkos::deep_copy(devOutEntries, input.graph.entries); + } switch (howExecSpecified) { - case SortCrsTest::Instance: - KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); + case SortCrsTest::Instance: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph(exec_space(), devOutRowmap, + devOutEntries, devOutRowmap, + devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph( + exec_space(), input.graph.row_map, input.graph.entries, + devOutRowmap, devOutEntries); + } break; - case SortCrsTest::ExplicitType: - KokkosSparse::sort_and_merge_graph( - 
input.graph.row_map, input.graph.entries, devOutRowmap, - devOutEntries); + } + case SortCrsTest::ExplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph( + devOutRowmap, devOutEntries, devOutRowmap, devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); + } break; - case SortCrsTest::ImplicitType: - KokkosSparse::sort_and_merge_graph(input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); + } + case SortCrsTest::ImplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph(devOutRowmap, devOutEntries, + devOutRowmap, devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph(input.graph.row_map, + input.graph.entries, + devOutRowmap, devOutEntries); + } + } } outputGraph = graph_t(devOutEntries, devOutRowmap); } @@ -397,21 +425,53 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, rowmap_t devOutRowmap; entries_t devOutEntries; values_t devOutValues; + if (inPlace) { + // Start out with the output views containing the input, so that + // sort/merge is done in-place + devOutRowmap = rowmap_t("devOutRowmap", input.graph.row_map.extent(0)); + devOutEntries = + entries_t("devOutEntries", input.graph.entries.extent(0)); + devOutValues = values_t("devOutValues", input.values.extent(0)); + Kokkos::deep_copy(devOutRowmap, input.graph.row_map); + Kokkos::deep_copy(devOutEntries, input.graph.entries); + Kokkos::deep_copy(devOutValues, input.values); + } switch (howExecSpecified) { - case SortCrsTest::Instance: - KokkosSparse::sort_and_merge_matrix( - exec_space(), input.graph.row_map, input.graph.entries, - input.values, devOutRowmap, devOutEntries, devOutValues); + case SortCrsTest::Instance: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix( + exec_space(), devOutRowmap, devOutEntries, devOutValues, + devOutRowmap, devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + exec_space(), 
input.graph.row_map, input.graph.entries, + input.values, devOutRowmap, devOutEntries, devOutValues); + } break; - case SortCrsTest::ExplicitType: - KokkosSparse::sort_and_merge_matrix( - input.graph.row_map, input.graph.entries, input.values, - devOutRowmap, devOutEntries, devOutValues); + } + case SortCrsTest::ExplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix( + devOutRowmap, devOutEntries, devOutValues, devOutRowmap, + devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } break; - case SortCrsTest::ImplicitType: - KokkosSparse::sort_and_merge_matrix( - input.graph.row_map, input.graph.entries, input.values, - devOutRowmap, devOutEntries, devOutValues); + } + case SortCrsTest::ImplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix(devOutRowmap, devOutEntries, + devOutValues, devOutRowmap, + devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } + } } // and then construct output from views output = crsMat_t("Output", nrows, ncols, devOutValues.extent(0), @@ -493,10 +553,14 @@ TEST_F(TestCategory, common_sort_merge_crsmatrix) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) - continue; - testSortAndMerge(false, howExecSpecified, - doStructInterface, testCase); + for (int inPlace = 0; inPlace < 2; inPlace++) { + if (doStructInterface && + howExecSpecified == SortCrsTest::ExplicitType) + continue; + if (doStructInterface && inPlace) continue; + testSortAndMerge(false, howExecSpecified, + doStructInterface, inPlace, testCase); + } } } } @@ -507,10 +571,14 @@ TEST_F(TestCategory, 
common_sort_merge_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) - continue; - testSortAndMerge(true, howExecSpecified, - doStructInterface, testCase); + for (int inPlace = 0; inPlace < 2; inPlace++) { + if (doStructInterface && + howExecSpecified == SortCrsTest::ExplicitType) + continue; + if (doStructInterface && inPlace) continue; + testSortAndMerge(true, howExecSpecified, + doStructInterface, inPlace, testCase); + } } } } From bf0003bc126c0e0d4bb819ddf3a7cad72013c62e Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 15 Sep 2023 15:02:14 -0600 Subject: [PATCH 175/231] MDF: set default verbosity explicitly to avoid valgrind warnings --- sparse/src/KokkosSparse_mdf_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 03fd660b95..c6005bee12 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -58,7 +58,7 @@ struct MDF_handle { // elimination during the factorization. col_ind_type permutation, permutation_inv; - int verbosity; + int verbosity = 0; crs_matrix_type L, U; From f5ad8b8cbdc3400620e6cda499b98ce938fa30f2 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 18 Sep 2023 13:00:21 -0600 Subject: [PATCH 176/231] Don't assume the default memory space is used (#1969) (Fixes #1910) In the source and especially unit tests, assume the device type we want can have any memory space (not just the default memory space of the execution space). Lets TestExecSpace either be a Kokkos::Device type, or just an execution space (the old behavior). In all Cuda unit tests, use Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace> as the TestExecSpace if CudaUVMSpace is instantiated but CudaSpace is not. Otherwise, use Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>. 
All other backends are unchanged, but this would also let us test devices like and in the future. --- .../unit_test/Test_Batched_SerialAxpy.hpp | 3 +- .../Test_Batched_SerialEigendecomposition.hpp | 3 +- .../unit_test/Test_Batched_SerialGemm.hpp | 3 +- .../unit_test/Test_Batched_SerialGesv.hpp | 3 +- .../Test_Batched_SerialInverseLU.hpp | 9 ++- .../dense/unit_test/Test_Batched_SerialLU.hpp | 3 +- .../unit_test/Test_Batched_SerialSVD.hpp | 9 ++- .../unit_test/Test_Batched_SerialSolveLU.hpp | 9 ++- .../unit_test/Test_Batched_SerialTrmm.hpp | 3 +- .../unit_test/Test_Batched_SerialTrsm.hpp | 3 +- .../unit_test/Test_Batched_SerialTrsv.hpp | 3 +- .../unit_test/Test_Batched_SerialTrtri.hpp | 3 +- .../dense/unit_test/Test_Batched_TeamAxpy.hpp | 5 +- .../dense/unit_test/Test_Batched_TeamGemm.hpp | 5 +- .../dense/unit_test/Test_Batched_TeamGesv.hpp | 5 +- .../unit_test/Test_Batched_TeamInverseLU.hpp | 5 +- .../dense/unit_test/Test_Batched_TeamLU.hpp | 4 +- .../unit_test/Test_Batched_TeamSolveLU.hpp | 11 ++- .../dense/unit_test/Test_Batched_TeamTrsm.hpp | 5 +- .../dense/unit_test/Test_Batched_TeamTrsv.hpp | 5 +- .../unit_test/Test_Batched_TeamVectorAxpy.hpp | 5 +- ...t_Batched_TeamVectorEigendecomposition.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorGemm.hpp | 5 +- .../unit_test/Test_Batched_TeamVectorGesv.hpp | 5 +- .../unit_test/Test_Batched_TeamVectorQR.hpp | 3 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 3 +- .../Test_Batched_TeamVectorSolveUTV.hpp | 3 +- .../Test_Batched_TeamVectorSolveUTV2.hpp | 3 +- .../unit_test/Test_Batched_TeamVectorUTV.hpp | 3 +- .../unit_test/Test_Batched_SerialGMRES.hpp | 3 +- .../unit_test/Test_Batched_SerialSpmv.hpp | 3 +- .../sparse/unit_test/Test_Batched_TeamCG.hpp | 5 +- .../unit_test/Test_Batched_TeamGMRES.hpp | 5 +- .../unit_test/Test_Batched_TeamSpmv.hpp | 3 +- .../unit_test/Test_Batched_TeamVectorCG.hpp | 5 +- .../Test_Batched_TeamVectorGMRES.hpp | 5 +- .../unit_test/Test_Batched_TeamVectorSpmv.hpp | 3 +- 
blas/unit_test/Test_Blas1_rotg.hpp | 20 ++--- blas/unit_test/Test_Blas1_rotmg.hpp | 10 +-- blas/unit_test/Test_Blas1_serial_setscal.hpp | 3 +- blas/unit_test/Test_Blas1_team_abs.hpp | 9 ++- blas/unit_test/Test_Blas1_team_axpby.hpp | 12 +-- blas/unit_test/Test_Blas1_team_axpy.hpp | 9 ++- blas/unit_test/Test_Blas1_team_dot.hpp | 12 +-- blas/unit_test/Test_Blas1_team_mult.hpp | 12 +-- blas/unit_test/Test_Blas1_team_nrm2.hpp | 6 +- blas/unit_test/Test_Blas1_team_scal.hpp | 12 +-- blas/unit_test/Test_Blas1_team_setscal.hpp | 5 +- blas/unit_test/Test_Blas1_team_update.hpp | 12 +-- blas/unit_test/Test_Blas2_gemv_util.hpp | 6 +- blas/unit_test/Test_Blas3_gemm.hpp | 20 ++--- blas/unit_test/Test_Blas_serial_axpy.hpp | 3 +- blas/unit_test/Test_Blas_serial_nrm2.hpp | 4 +- common/unit_test/Test_Common_ArithTraits.hpp | 16 ++-- common/unit_test/Test_Common_Sorting.hpp | 79 ++++++++++--------- graph/unit_test/Test_Graph_graph_color.hpp | 7 +- ode/unit_test/Test_ODE_Newton.hpp | 77 +++++++++--------- ode/unit_test/Test_ODE_RK.hpp | 27 ++++--- ode/unit_test/Test_ODE_RK_chem.hpp | 11 +-- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 4 +- sparse/src/KokkosSparse_sptrsv_supernode.hpp | 4 +- sparse/unit_test/Test_Sparse_SortCrs.hpp | 33 ++++---- sparse/unit_test/Test_Sparse_Transpose.hpp | 20 ++--- sparse/unit_test/Test_Sparse_coo2crs.hpp | 8 +- .../Test_Sparse_removeCrsMatrixZeros.hpp | 18 ++--- test_common/KokkosKernels_TestUtils.hpp | 11 +-- test_common/Test_Cuda.hpp | 12 ++- 67 files changed, 355 insertions(+), 290 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index 2bde3f7fad..90ce5addc3 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -30,6 +30,7 @@ namespace Axpy { template struct Functor_TestBatchedSerialAxpy { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ViewType 
_X; const ViewType _Y; @@ -54,7 +55,7 @@ struct Functor_TestBatchedSerialAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::RangePolicy policy(0, _X.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp index 5e90d5ae45..7eb2b89c83 100644 --- a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp @@ -34,6 +34,7 @@ namespace Test { typename ViewRank2Type, typename WorkViewType> struct Functor_TestBatchedSerialEigendecomposition { + using execution_space = typename DeviceType::execution_space; ViewRank3Type _A; ViewRank2Type _Er, _Ei; ViewRank3Type _UL, _UR; @@ -70,7 +71,7 @@ namespace Test { >::value ? "::ComplexFloat" : std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion( name.c_str() ); - Kokkos::RangePolicy policy(0, _A.extent(0)); + Kokkos::RangePolicy policy(0, _A.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 8304657849..7f27fa7dcf 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -40,6 +40,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -66,7 +67,7 @@ struct Functor_TestBatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index 3b17d81d48..bb05fab3bb 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -35,6 +35,7 @@ namespace Gesv { template struct Functor_TestBatchedSerialGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const MatrixType _tmp; const VectorType _X; @@ -61,7 +62,7 @@ struct Functor_TestBatchedSerialGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::RangePolicy policy(0, _X.extent(0)); 
Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index d3cbd6c024..23ded73e25 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -72,7 +73,7 @@ struct Functor_BatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,6 +81,7 @@ struct Functor_BatchedSerialGemm { template struct Functor_BatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -100,7 +102,7 @@ struct Functor_BatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -109,6 +111,7 @@ struct Functor_BatchedSerialLU { template struct Functor_TestBatchedSerialInverseLU { + using execution_space = typename DeviceType::execution_space; AViewType _a; WViewType _w; @@ -130,7 +133,7 @@ struct Functor_TestBatchedSerialInverseLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - 
Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::InverseLUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 23b72893b2..87224aa888 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -32,6 +32,7 @@ namespace Test { template struct Functor_TestBatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -52,7 +53,7 @@ struct Functor_TestBatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index 5aa832f0df..fb56e25894 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -406,13 +406,14 @@ void GenerateTestData(ViewT data) { }); } -template +template void testIssue1786() { - using memory_space = typename ExeSpace::memory_space; + using execution_space = typename Device::execution_space; + using memory_space = typename Device::memory_space; constexpr int num_tests = 4; Kokkos::View matrices("data", num_tests); - GenerateTestData(matrices); + GenerateTestData(matrices); Kokkos::View Us("Us", matrices.extent(0)); Kokkos::View Ss("Ss", matrices.extent(0)); @@ -425,7 +426,7 @@ void testIssue1786() { "matrices_copy", matrices.extent(0)); // make a copy of the input data to avoid overwriting it Kokkos::deep_copy(matrices_copy, matrices); - auto policy = 
Kokkos::RangePolicy(0, matrices.extent(0)); + auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); Kokkos::parallel_for( "polar decomposition", policy, KOKKOS_LAMBDA(int i) { auto matrix_copy = diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index 48e8e5dead..43cb8fab2f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -72,7 +73,7 @@ struct Functor_BatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,6 +81,7 @@ struct Functor_BatchedSerialGemm { template struct Functor_BatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -100,7 +102,7 @@ struct Functor_BatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -109,6 +111,7 @@ struct Functor_BatchedSerialLU { template struct Functor_TestBatchedSerialSolveLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; @@ -130,7 +133,7 @@ struct Functor_TestBatchedSerialSolveLU { const std::string name_value_type = Test::value_type_name(); 
std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::SolveLUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index af38e62e4d..7a7e89ebf8 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -113,6 +113,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrmm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -138,7 +139,7 @@ struct Functor_TestBatchedSerialTrmm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index c0ef098652..f9418a804a 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -40,6 +40,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrsm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -65,7 +66,7 @@ struct Functor_TestBatchedSerialTrsm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::RangePolicy policy(0, _b.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff 
--git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index f05a6f7fa5..512dce3bce 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -39,6 +39,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrsv { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -64,7 +65,7 @@ struct Functor_TestBatchedSerialTrsv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::RangePolicy policy(0, _b.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index 8f4ae64b7e..b09cadcb7e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -113,6 +113,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrtri { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -132,7 +133,7 @@ struct Functor_TestBatchedSerialTrtri { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for("Functor_TestBatchedSerialTrtri", policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index 7941fc0284..b43b498607 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -30,6 +30,7 @@ 
namespace TeamAxpy { template struct Functor_TestBatchedTeamAxpy { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ViewType _X; const ViewType _Y; @@ -65,8 +66,8 @@ struct Functor_TestBatchedTeamAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index 9023a009af..2d952889c9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -41,6 +41,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -73,8 +74,8 @@ struct Functor_TestBatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index 89f67e2731..dc3b4e53fb 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -35,6 +35,7 @@ namespace TeamGesv { template struct Functor_TestBatchedTeamGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const VectorType _X; const VectorType _B; @@ -62,8 
+63,8 @@ struct Functor_TestBatchedTeamGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); using MatrixViewType = Kokkos::View struct Functor_BatchedTeamGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -82,8 +83,8 @@ struct Functor_BatchedTeamGemm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index 04e191b9cb..e20f3a7411 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -34,6 +34,8 @@ namespace TeamLU { template struct Functor_TestBatchedTeamLU { + using execution_space = typename DeviceType::execution_space; + ViewType _a; KOKKOS_INLINE_FUNCTION @@ -60,7 +62,7 @@ struct Functor_TestBatchedTeamLU { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 41287f9b52..445e10132f 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct 
Functor_BatchedTeamGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -81,14 +82,15 @@ struct Functor_BatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; template struct Functor_BatchedTeamLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -113,7 +115,7 @@ struct Functor_BatchedTeamLU { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -121,6 +123,7 @@ struct Functor_BatchedTeamLU { template struct Functor_TestBatchedTeamSolveLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; @@ -146,7 +149,7 @@ struct Functor_TestBatchedTeamSolveLU { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::SolveLU").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 2f7781745d..523bd02df4 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -43,6 +43,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamTrsm { + using 
execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -74,8 +75,8 @@ struct Functor_TestBatchedTeamTrsm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index bb00b78736..400e35deb8 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -41,6 +41,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamTrsv { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -72,8 +73,8 @@ struct Functor_TestBatchedTeamTrsv { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index 5ea8a80717..fca0534b4b 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -30,6 +30,7 @@ namespace TeamVectorAxpy { template struct Functor_TestBatchedTeamVectorAxpy { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ViewType _X; const ViewType _Y; @@ -66,8 +67,8 @@ struct Functor_TestBatchedTeamVectorAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - 
Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp index 69cab9c63c..bf907feb96 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp @@ -74,7 +74,7 @@ name_value_type = ( std::is_same::value ? "::Float" : "::ComplexFloat" : std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion( name.c_str() ); - Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index 327f28353e..f2f3bc217d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ -36,6 +36,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamVector { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -68,8 +69,8 @@ struct Functor_TestBatchedTeamVector { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp 
b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index 2026f2f81d..ddb1a5c40d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -35,6 +35,7 @@ namespace TeamVectorGesv { template struct Functor_TestBatchedTeamVectorGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const VectorType _X; const VectorType _B; @@ -63,8 +64,8 @@ struct Functor_TestBatchedTeamVectorGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); using MatrixViewType = Kokkos::View struct Functor_TestBatchedTeamVectorQR { + using execution_space = typename DeviceType::execution_space; MatrixViewType _a; VectorViewType _x, _b, _t; WorkViewType _w; @@ -99,7 +100,7 @@ struct Functor_TestBatchedTeamVectorQR { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index c86d4e86a8..09427aa25e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { + using execution_space = typename DeviceType::execution_space; MatrixViewType _a; VectorViewType _x, _b, _t; PivotViewType _p; @@ -108,7 +109,7 @@ struct 
Functor_TestBatchedTeamVectorQR_WithColumnPivoting { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 29496c1b87..2f30c7d3c1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorSolveUTV { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -121,7 +122,7 @@ struct Functor_TestBatchedTeamVectorSolveUTV { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index 45d6093f2a..cf7084a92c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorSolveUTV2 { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -125,7 +126,7 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy 
policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index 527c93e059..eb45a70c89 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -34,6 +34,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorUTV { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -155,7 +156,7 @@ struct Functor_TestBatchedTeamVectorUTV { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index 45b6a71f99..e28efb9b82 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -32,6 +32,7 @@ namespace GMRES { template struct Functor_TestBatchedSerialGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -85,7 +86,7 @@ struct Functor_TestBatchedSerialGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); + Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 338a93d0eb..05f2724c5b 100644 
--- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -41,6 +41,7 @@ template struct Functor_TestBatchedSerialSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -75,7 +76,7 @@ struct Functor_TestBatchedSerialSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _D.extent(0)); + Kokkos::RangePolicy policy(0, _D.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index 41fa682bdd..b05f3db61f 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -31,6 +31,7 @@ namespace TeamCG { template struct Functor_TestBatchedTeamCG { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -79,8 +80,8 @@ struct Functor_TestBatchedTeamCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index 2b7ab73790..de1a7f4fc2 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -32,6 +32,7 @@ namespace TeamGMRES { 
template struct Functor_TestBatchedTeamGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -91,8 +92,8 @@ struct Functor_TestBatchedTeamGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index 5c077f75ed..a6c9ac7ea8 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ -42,6 +42,7 @@ template struct Functor_TestBatchedTeamSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -99,7 +100,7 @@ struct Functor_TestBatchedTeamSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( + Kokkos::TeamPolicy policy( _D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index abadf27953..3ffd68209b 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -31,6 +31,7 @@ namespace TeamVectorCG { template struct Functor_TestBatchedTeamVectorCG { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -81,8 
+82,8 @@ struct Functor_TestBatchedTeamVectorCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index f4f208a829..084b623aa2 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -32,6 +32,7 @@ namespace TeamVectorGMRES { template struct Functor_TestBatchedTeamVectorGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -91,8 +92,8 @@ struct Functor_TestBatchedTeamVectorGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index 67d944b159..9cbba56370 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -42,6 +42,7 @@ template struct Functor_TestBatchedTeamVectorSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -106,7 +107,7 @@ struct 
Functor_TestBatchedTeamVectorSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( + Kokkos::TeamPolicy policy( ceil(static_cast(_D.extent(0)) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); diff --git a/blas/unit_test/Test_Blas1_rotg.hpp b/blas/unit_test/Test_Blas1_rotg.hpp index 338eaa1b56..8efcb50182 100644 --- a/blas/unit_test/Test_Blas1_rotg.hpp +++ b/blas/unit_test/Test_Blas1_rotg.hpp @@ -16,12 +16,12 @@ #include namespace Test { -template -void test_rotg_impl(ExecSpace const& space, Scalar const a_in, - Scalar const b_in) { +template +void test_rotg_impl(typename Device::execution_space const& space, + Scalar const a_in, Scalar const b_in) { using magnitude_type = typename Kokkos::ArithTraits::mag_type; - using SViewType = Kokkos::View; - using MViewType = Kokkos::View; + using SViewType = Kokkos::View; + using MViewType = Kokkos::View; // const magnitude_type eps = Kokkos::ArithTraits::eps(); // const Scalar zero = Kokkos::ArithTraits::zero(); @@ -43,17 +43,17 @@ void test_rotg_impl(ExecSpace const& space, Scalar const a_in, } } // namespace Test -template +template int test_rotg() { const Scalar zero = Kokkos::ArithTraits::zero(); const Scalar one = Kokkos::ArithTraits::one(); const Scalar two = one + one; - ExecutionSpace space{}; + typename Device::execution_space space{}; - Test::test_rotg_impl(space, one, zero); - Test::test_rotg_impl(space, one / two, one / two); - Test::test_rotg_impl(space, 2.1 * one, 1.3 * one); + Test::test_rotg_impl(space, one, zero); + Test::test_rotg_impl(space, one / two, one / two); + Test::test_rotg_impl(space, 2.1 * one, 1.3 * one); return 1; } diff --git a/blas/unit_test/Test_Blas1_rotmg.hpp b/blas/unit_test/Test_Blas1_rotmg.hpp index f628505d97..ecfc3b6815 100644 --- a/blas/unit_test/Test_Blas1_rotmg.hpp +++ 
b/blas/unit_test/Test_Blas1_rotmg.hpp @@ -218,14 +218,10 @@ void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, } } // namespace Test -template +template int test_rotmg() { - Kokkos::View> - d1("d1"), d2("d2"), x1("x1"), y1("y1"); - Kokkos::View> - param("param"); + Kokkos::View d1("d1"), d2("d2"), x1("x1"), y1("y1"); + Kokkos::View param("param"); Kokkos::View ref_vals( "reference values"); diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index 80a0561d60..6c55ef65e3 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -87,7 +87,8 @@ struct Functor_TestBlasSerialMatUtil { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy + policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return 0; diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index d3f4f661d0..642be144e4 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_abs(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -109,7 +110,8 @@ void impl_test_team_abs(int N) { template void impl_test_team_abs_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -122,8 +124,7 @@ void 
impl_test_team_abs_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index e11b1e14a5..7052371bdb 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_axpby(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -48,8 +49,7 @@ void impl_test_team_axpby(int N) { view_stride_adapter y("Y", N); view_stride_adapter org_y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -116,7 +116,8 @@ void impl_test_team_axpby(int N) { template void impl_test_team_axpby_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -129,8 +130,7 @@ void impl_test_team_axpby_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp 
b/blas/unit_test/Test_Blas1_team_axpy.hpp index 5cff9d025e..87a4a80fba 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_axpy(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -113,7 +114,8 @@ void impl_test_team_axpy(int N) { template void impl_test_team_axpy_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -126,8 +128,7 @@ void impl_test_team_axpy_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 00c0940023..ec8dad838a 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -28,7 +28,8 @@ namespace Test { template void impl_test_team_dot(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -42,8 +43,7 @@ void impl_test_team_dot(int N) { view_stride_adapter a("a", N); view_stride_adapter b("b", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool 
rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); @@ -161,7 +161,8 @@ void impl_test_team_dot(int N) { template void impl_test_team_dot_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -173,8 +174,7 @@ void impl_test_team_dot_mv(int N, int K) { view_stride_adapter a("A", N, K); view_stride_adapter b("B", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index f340ac2309..e8802a84a6 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_mult(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -50,8 +51,7 @@ void impl_test_team_mult(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -158,7 +158,8 @@ void impl_test_team_mult(int N) { template void impl_test_team_mult_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef 
typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -174,8 +175,7 @@ void impl_test_team_mult_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); typename Kokkos::ArithTraits::mag_type const max_val = 10; Kokkos::fill_random(x.d_view, rand_pool, ScalarA(max_val)); diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 4bc4836782..e5008441a4 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -28,7 +28,8 @@ namespace Test { template void impl_test_team_nrm2(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -39,8 +40,7 @@ void impl_test_team_nrm2(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index e0c109e1af..aad2e8723f 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_scal(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -49,8 +50,7 @@ void impl_test_team_scal(int N) { typename AT::mag_type zero = AT::abs(AT::zero()); typename AT::mag_type one = 
AT::abs(AT::one()); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); @@ -122,7 +122,8 @@ void impl_test_team_scal(int N) { template void impl_test_team_scal_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -135,8 +136,7 @@ void impl_test_team_scal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::deep_copy(x.h_base, x.d_base); diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index ff593d3eeb..cae6e68be0 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -36,6 +36,7 @@ struct NaiveTag {}; template struct Functor_TestBlasTeamMatUtil { + using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _a; @@ -97,8 +98,8 @@ struct Functor_TestBlasTeamMatUtil { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index 09b60440ae..35182b27fd 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_update(int N) { - typedef Kokkos::TeamPolicy team_policy; + using 
execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -51,8 +52,7 @@ void impl_test_team_update(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -160,7 +160,8 @@ void impl_test_team_update(int N) { template void impl_test_team_update_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -175,8 +176,7 @@ void impl_test_team_update_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index 99b4516cff..80bf76b0dd 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -239,7 +239,8 @@ struct GEMVTest { template static void run_views(const char trans, ViewTypeA A, ViewTypeX x, ViewTypeY y) { - Kokkos::TeamPolicy teams(1, 1); // just run on device + Kokkos::TeamPolicy teams( + 1, 1); // just run on device fill_inputs(A, x, y); ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? 
ScalarType beta = 5; @@ -279,7 +280,8 @@ struct GEMVTest { ViewTypeY, Device, ScalarType>; op_type gemv_op(trans, alpha, A, x, beta, y); - Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); + Kokkos::parallel_for( + Kokkos::TeamPolicy(1, 1), gemv_op); const double eps = epsilon(ScalarY{}); EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 13c52ec437..bc2d881600 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -81,7 +81,7 @@ void build_matrices(const int M, const int N, const int K, const typename ViewTypeA::value_type alpha, ViewTypeA& A, ViewTypeB& B, const typename ViewTypeA::value_type beta, ViewTypeC& C, ViewTypeC& Cref) { - using execution_space = TestExecSpace; + using execution_space = typename TestExecSpace::execution_space; using ScalarA = typename ViewTypeA::non_const_value_type; using ScalarB = typename ViewTypeB::non_const_value_type; using ScalarC = typename ViewTypeC::non_const_value_type; @@ -257,15 +257,16 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } } -template +template void impl_test_stream_gemm_psge2(const int M, const int N, const int K, const Scalar alpha, const Scalar beta) { - using ViewTypeA = Kokkos::View; - using ViewTypeB = Kokkos::View; - using ViewTypeC = Kokkos::View; - using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::ArithTraits; - using mag_type = typename APT::mag_type; + using execution_space = typename Device::execution_space; + using ViewTypeA = Kokkos::View; + using ViewTypeB = Kokkos::View; + using ViewTypeC = Kokkos::View; + using ScalarC = typename ViewTypeC::value_type; + using APT = Kokkos::ArithTraits; + using mag_type = typename APT::mag_type; const char tA[] = {"N"}; const char tB[] = {"N"}; @@ -336,6 +337,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, template void test_gemm() { + typedef typename 
TestExecSpace::execution_space execution_space; typedef Kokkos::View view_type_a; typedef Kokkos::View view_type_b; typedef Kokkos::View view_type_c; @@ -371,7 +373,7 @@ void test_gemm() { } } } - auto pool_size = TestExecSpace().concurrency(); + auto pool_size = execution_space().concurrency(); if (pool_size >= 2) { Test::impl_test_stream_gemm_psge2( 53, 42, 17, 4.5, diff --git a/blas/unit_test/Test_Blas_serial_axpy.hpp b/blas/unit_test/Test_Blas_serial_axpy.hpp index 48b417c96d..e6a571b7de 100644 --- a/blas/unit_test/Test_Blas_serial_axpy.hpp +++ b/blas/unit_test/Test_Blas_serial_axpy.hpp @@ -32,6 +32,7 @@ struct NaiveAxpyTag {}; template struct Functor_TestBlasSerialAxpy { + using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _x; ViewType _y; @@ -71,7 +72,7 @@ struct Functor_TestBlasSerialAxpy { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; diff --git a/blas/unit_test/Test_Blas_serial_nrm2.hpp b/blas/unit_test/Test_Blas_serial_nrm2.hpp index a4af218ff3..1e0a7a4ffa 100644 --- a/blas/unit_test/Test_Blas_serial_nrm2.hpp +++ b/blas/unit_test/Test_Blas_serial_nrm2.hpp @@ -70,7 +70,7 @@ struct Functor_TestBlasSerialNrm2 { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; @@ -125,7 +125,7 @@ struct Functor_TestBlasSerialNrm2MV { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + 
Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 8aa963b2ab..b5bf350847 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -119,7 +119,7 @@ struct HasTranscendentals { template class ArithTraitsTesterBase { public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -430,7 +430,7 @@ class ArithTraitsTesterTranscendentalBase typedef ArithTraitsTesterBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -509,7 +509,7 @@ class ArithTraitsTesterTranscendentalBase } public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -993,7 +993,7 @@ class ArithTraitsTesterComplexBase typedef ArithTraitsTesterTranscendentalBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1079,7 +1079,7 @@ class ArithTraitsTesterComplexBase typedef ArithTraitsTesterTranscendentalBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. 
typedef int value_type; @@ -1217,7 +1217,7 @@ class ArithTraitsTesterFloatingPointBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1336,7 +1336,7 @@ class ArithTraitsTesterFloatingPointBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1417,7 +1417,7 @@ template class ArithTraitsTester : public ArithTraitsTesterFloatingPointBase { public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 51ecf228a8..6969453395 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -178,17 +178,18 @@ struct TestSerialRadix2Functor { OrdView offsets; }; -template +template void testSerialRadixSort(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Radix sort testing data", n); fillRandom(keys); // Sort using std::sort on host to do correctness test @@ 
-196,11 +197,11 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { Kokkos::deep_copy(gold, keys); KeyView keysAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel - typedef Kokkos::RangePolicy range_policy; + typedef Kokkos::RangePolicy range_policy; Kokkos::parallel_for( range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -218,18 +219,19 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { } } -template +template void testSerialRadixSort2(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Radix test keys", n); ValView data("Radix test data", n); // The keys are randomized @@ -239,12 +241,12 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { KeyView keysAux("Radix sort aux keys", n); ValView dataAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel - typedef Kokkos::RangePolicy range_policy; + typedef Kokkos::RangePolicy range_policy; // Deliberately using a weird number for vector length Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor( keys, keysAux, data, dataAux, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); // Sort using std::sort on host to do correctness test auto countsHost = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); @@ -312,30 +314,31 @@ struct TestTeamBitonic2Functor { OrdView offsets; }; -template +template void testTeamBitonicSort(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); ValView data("Bitonic sort testing data", n); fillRandom(data); Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, data); // Run the sorting on device in all sub-arrays in parallel Kokkos::parallel_for( - Kokkos::TeamPolicy(k, Kokkos::AUTO()), + Kokkos::TeamPolicy(k, Kokkos::AUTO()), TestTeamBitonicFunctor(data, counts, offsets)); // Copy result to host auto dataHost = Kokkos::create_mirror_view(data); Kokkos::deep_copy(dataHost, data); // Sort using std::sort on host to do correctness test - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -350,18 +353,19 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) { } } -template +template void testTeamBitonicSort2(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n 
= generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); // The keys are randomized @@ -370,10 +374,10 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { Kokkos::deep_copy(gold, keys); // Run the sorting on device in all sub-arrays in parallel, just using vector // loops Deliberately using a weird number for vector length - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), TestTeamBitonic2Functor( keys, data, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -409,16 +413,17 @@ struct CheckSortedFunctor { View v; }; -template +template void testBitonicSort(size_t n) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View ValView; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckSortedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); @@ -444,19 +449,20 @@ struct CompareDescending { } }; -template +template void testBitonicSortDescending() { + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef char Scalar; typedef CompareDescending Comp; // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View ValView; size_t n = 12521; ValView data("Bitonic sort testing data", n); fillRandom(data); - 
KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); @@ -479,18 +485,19 @@ struct LexCompare { } }; -template +template void testBitonicSortLexicographic() { + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Coordinates Scalar; // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View ValView; size_t n = 9521; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 19ee9f600b..ee917e6ef4 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -96,9 +96,10 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, - typename lno_nnz_view_t::non_const_type, device>( - numRows, input_mat.graph.row_map, input_mat.graph.entries, sym_xadj, - sym_adj); + typename lno_nnz_view_t::non_const_type, + typename device::execution_space>(numRows, input_mat.graph.row_map, + input_mat.graph.entries, sym_xadj, + sym_adj); size_type numentries = sym_adj.extent(0); scalar_view_t newValues("vals", numentries); diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index 44a299b949..7cc3fd2a6d 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ 
b/ode/unit_test/Test_ODE_Newton.hpp @@ -75,17 +75,17 @@ struct NewtonSolve_wrapper { } }; -template +template void run_newton_test(const system_type& mySys, KokkosODE::Experimental::Newton_params& params, const scalar_type* const initial_val, const scalar_type* const solution) { + using execution_space = typename Device::execution_space; using newton_solver_status = KokkosODE::Experimental::newton_solver_status; - using vec_type = typename Kokkos::View; - using mat_type = typename Kokkos::View; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; - Kokkos::View status("Newton status", - 1); + Kokkos::View status("Newton status", 1); vec_type x("solution vector", mySys.neqs), rhs("right hand side vector", mySys.neqs); @@ -136,10 +136,10 @@ void run_newton_test(const system_type& mySys, // x^2 - x - 2 = 0 // Solution: x = 2 or x = -1 // Derivative 2*x - 1 -template +template struct QuadraticEquation { - using vec_type = Kokkos::View; - using mat_type = Kokkos::View; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; static constexpr int neqs = 1; @@ -198,11 +198,12 @@ struct LogarithmicEquation { } }; -template +template void test_newton_status() { + using execution_space = typename Device::execution_space; using newton_solver_status = KokkosODE::Experimental::newton_solver_status; - using vec_type = typename Kokkos::View; - using mat_type = typename Kokkos::View; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; double abs_tol, rel_tol; if (std::is_same_v) { @@ -220,7 +221,7 @@ void test_newton_status() { auto status_h = Kokkos::create_mirror_view(status); // Create the non-linear system and initialize data - QuadraticEquation my_system{}; + QuadraticEquation my_system{}; scalar_type initial_value[3] = {1.0, -0.5, 0.5}; #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -259,8 +260,9 @@ void test_newton_status() { } } -template +template void test_simple_problems() { + using execution_space = typename 
Device::execution_space; double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -280,11 +282,11 @@ void test_simple_problems() { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "\nStarting Quadratic Equation problem" << std::endl; #endif - using system_type = QuadraticEquation; + using system_type = QuadraticEquation; system_type mySys{}; scalar_type initial_value[2] = {1.0, -0.5}, solution[2] = {2.0, -1.0}; for (int idx = 0; idx < 2; ++idx) { - run_newton_test( + run_newton_test( mySys, params, &(initial_value[idx]), &(solution[idx])); } #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -300,8 +302,8 @@ void test_simple_problems() { using system_type = TrigonometricEquation; system_type mySys{}; scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; - run_newton_test( - mySys, params, initial_value, solution); + run_newton_test(mySys, params, + initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Trigonometric Equation problem" << std::endl; #endif @@ -317,8 +319,8 @@ void test_simple_problems() { scalar_type initial_value[1] = {static_cast(0.5)}, solution[1] = {static_cast(1.0) / static_cast(7.0)}; - run_newton_test( - mySys, params, initial_value, solution); + run_newton_test(mySys, params, + initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Logarithmic Equation problem" << std::endl; #endif @@ -340,10 +342,10 @@ void test_simple_problems() { // // Solution: x = 10.75/6 y = +/- sqrt(2.25 + 7.25/6) // ~ 1.7916666 ~ +/- 0.8887803753 -template +template struct CirclesIntersections { - using vec_type = Kokkos::View; - using mat_type = Kokkos::View; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; static constexpr int neqs = 2; @@ -374,10 +376,10 @@ struct CirclesIntersections { // x1~ 0.5176380902 y1~ 1.9318516525 // x2~ -0.5176380902 y2~ -1.9318516525 // x3~ -1.9318516525 y3~ -0.5176380902 -template +template struct CircleHyperbolaIntersection { - using vec_type = Kokkos::View; - using 
mat_type = Kokkos::View; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; static constexpr int neqs = 2; @@ -396,8 +398,9 @@ struct CircleHyperbolaIntersection { } }; -template +template void test_simple_systems() { + using execution_space = typename Device::execution_space; double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -419,8 +422,8 @@ void test_simple_systems() { system_type mySys{}; scalar_type initial_values[2] = {1.5, 1.5}; scalar_type solution[2] = {10.75 / 6, 0.8887803753}; - run_newton_test( - mySys, params, initial_values, solution); + run_newton_test(mySys, params, + initial_values, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circles Intersetcion problem" << std::endl; #endif @@ -432,8 +435,7 @@ void test_simple_systems() { std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" << std::endl; #endif - using system_type = - CircleHyperbolaIntersection; + using system_type = CircleHyperbolaIntersection; system_type mySys{}; scalar_type init_vals[2] = {0.0, 1.0}; @@ -443,8 +445,8 @@ void test_simple_systems() { 4 + Kokkos::sqrt(static_cast(12.0)) / 2)), Kokkos::sqrt(static_cast( (4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; - run_newton_test( - mySys, params, init_vals, solutions); + run_newton_test(mySys, params, init_vals, + solutions); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circle/Hyperbola Intersetcion problem" << std::endl; #endif @@ -457,12 +459,13 @@ void test_simple_systems() { // happen within a FE/FD code. 
// //////////////////////////////////////////// -template +template void test_newton_on_device() { - using vec_type = Kokkos::View; - using mat_type = Kokkos::View; - using newton_params = KokkosODE::Experimental::Newton_params; - using system_type = CircleHyperbolaIntersection; + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using newton_params = KokkosODE::Experimental::Newton_params; + using system_type = CircleHyperbolaIntersection; using newton_solver_status = KokkosODE::Experimental::newton_solver_status; double abs_tol, rel_tol; diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 2e6df4fd81..039e0211eb 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -174,11 +174,12 @@ void test_method(const std::string label, ode_type& my_ode, } // test_method -template +template void test_RK() { - using RK_type = KokkosODE::Experimental::RK_type; - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; @@ -349,11 +350,12 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, } // test_method -template +template void test_convergence_rate() { - using RK_type = KokkosODE::Experimental::RK_type; - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; @@ -463,11 +465,12 @@ void test_convergence_rate() { } } // test_convergence_rate -template +template void test_adaptivity() { - using RK_type = KokkosODE::Experimental::RK_type; - using 
vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 5abdd41d00..763394e1ec 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -89,12 +89,13 @@ struct chem_model_2 { } }; -template +template void test_chem() { - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; - using RK_type = KokkosODE::Experimental::RK_type; - using solver_type = KokkosODE::Experimental::RungeKutta; + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using solver_type = KokkosODE::Experimental::RungeKutta; { chem_model_1 chem_model; diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index e2a625e2a7..b7e7fa1650 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2915,7 +2915,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename execution_space::memory_space; + using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3289,7 +3289,7 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using 
memory_space = typename execution_space::memory_space; + using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; diff --git a/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 845efabc57..0be3abac08 100644 --- a/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -1402,8 +1402,8 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, // If we are running KokkosKernels::trmm on device, // then we need to allocate a workspace on device using trmm_execution_space = typename KernelHandle::HandleExecSpace; - using trmm_memory_space = typename trmm_execution_space::memory_space; - using trmm_view_t = Kokkos::View; + using trmm_memory_space = typename KernelHandle::HandlePersistentMemorySpace; + using trmm_view_t = Kokkos::View; #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // use KokkosBlas::trmm only with CUBLAS (since deep-copy to host throws an // error) diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 6cf989accf..fe092461f3 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -41,15 +41,14 @@ enum : int { }; } -template +template void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, bool doStructInterface, int howExecSpecified) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = typename device_t::execution_space; using crsMat_t = 
KokkosSparse::CrsMatrix; // Create a random matrix on device @@ -160,14 +159,13 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, } } -template +template void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { // This test is about bug #960. - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix, @@ -207,14 +205,13 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } } -template +template void testSortAndMerge(bool justGraph, int howExecSpecified, bool doStructInterface, bool inPlace, int testCase) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::staticcrsgraph_type; diff --git a/sparse/unit_test/Test_Sparse_Transpose.hpp b/sparse/unit_test/Test_Sparse_Transpose.hpp index 0b9ba1a611..35f7a0516c 100644 --- a/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -40,14 +40,13 @@ struct ExactCompare { V v2; }; -template +template void testTranspose(int numRows, int numCols, bool doValues) { + using exec_space = typename device_t::execution_space; using range_pol = Kokkos::RangePolicy; using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = 
Kokkos::Device; using crsMat_t = typename KokkosSparse::CrsMatrix; using c_rowmap_t = typename crsMat_t::row_map_type; @@ -158,13 +157,11 @@ void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { EXPECT_EQ(size_type(0), valuesDiffs); } -template +template void testTransposeBsrRef() { using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; @@ -236,13 +233,12 @@ void testTransposeBsrRef() { CompareBsrMatrices(At, At_ref); } -template +template void testTransposeBsr(int numRows, int numCols, int blockSize) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = typename device_t::execution_space; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp index 8a52a39220..a2ccd6bc62 100644 --- a/sparse/unit_test/Test_Sparse_coo2crs.hpp +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -197,10 +197,10 @@ void check_crs_matrix(CrsType crsMat, RowType row, ColType col, DataType data, } } -template +template void doCoo2Crs(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - RandCooMat cooMat(m, n, m * n, min_val, - max_val); + RandCooMat cooMat(m, n, m * n, min_val, + max_val); auto randRow = cooMat.get_row(); auto randCol = cooMat.get_col(); auto randData = cooMat.get_data(); @@ -329,4 +329,4 @@ TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { auto crsMatFsTs1 = KokkosSparse::coo2crs(m, n, row, col, data); check_crs_matrix(crsMatFsTs1, row_h, col_h, data); } -} // namespace Test \ No newline 
at end of file +} // namespace Test diff --git a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp index b5c57dbe49..bdd175558f 100644 --- a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp +++ b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -91,9 +91,7 @@ Matrix loadMatrixFromVectors(int numRows, int numCols, template void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { - using Offset = typename Matrix::size_type; - using Device = - Kokkos::Device; + using Offset = typename Matrix::size_type; bool haveHardcodedReference = true; switch (test) { case 0: { @@ -226,7 +224,8 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { if (haveHardcodedReference) { Matrix Afiltered_refimpl = removeMatrixZerosReference(A); bool referenceImplMatchesHardcoded = - Test::is_same_matrix(Afiltered_ref, Afiltered_refimpl); + Test::is_same_matrix(Afiltered_ref, + Afiltered_refimpl); ASSERT_TRUE(referenceImplMatchesHardcoded) << "Test case " << test << ": reference impl gave wrong answer!"; } @@ -236,15 +235,14 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { void testRemoveCrsMatrixZeros(int testCase) { using namespace TestRemoveCrsMatrixZeros; - using Device = - Kokkos::Device; - using Matrix = KokkosSparse::CrsMatrix; + using Matrix = + KokkosSparse::CrsMatrix; Matrix A, Afiltered_ref; getTestInput(testCase, A, Afiltered_ref); Matrix Afiltered_actual = KokkosSparse::removeCrsMatrixZeros(A); - bool matches = - Test::is_same_matrix(Afiltered_actual, Afiltered_ref); + bool matches = Test::is_same_matrix(Afiltered_actual, + Afiltered_ref); EXPECT_TRUE(matches) << "Test case " << testCase << ": matrix with zeros filtered out does not match reference."; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 95a3459699..330c00cde6 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp 
@@ -574,13 +574,14 @@ int string_compare_no_case(const std::string& str1, const std::string& str2) { /// /brief Coo matrix class for testing purposes. /// \tparam ScalarType /// \tparam LayoutType -/// \tparam ExeSpaceType -template +/// \tparam Device +template class RandCooMat { private: - using RowViewTypeD = Kokkos::View; - using ColViewTypeD = Kokkos::View; - using DataViewTypeD = Kokkos::View; + using ExeSpaceType = typename Device::execution_space; + using RowViewTypeD = Kokkos::View; + using ColViewTypeD = Kokkos::View; + using DataViewTypeD = Kokkos::View; RowViewTypeD __row_d; ColViewTypeD __col_d; DataViewTypeD __data_d; diff --git a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 0bfe35718b..9230d7f935 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -32,6 +32,16 @@ class Cuda : public ::testing::Test { }; #define TestCategory Cuda -#define TestExecSpace Kokkos::Cuda + +using CudaSpaceDevice = Kokkos::Device; +using CudaUVMSpaceDevice = Kokkos::Device; + +// Prefer for any testing where only one exec space is used +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE) +#define TestExecSpace CudaUVMSpaceDevice +#else +#define TestExecSpace CudaSpaceDevice +#endif #endif // TEST_CUDA_HPP From 3dafcbf48fd9b1d09fd5519a7a1a762ec2a51756 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 18 Sep 2023 13:02:53 -0600 Subject: [PATCH 177/231] Rename TestExecSpace to TestDevice --- .../Test_Batched_BatchedGemm_Complex.hpp | 48 +-- .../Test_Batched_BatchedGemm_Real.hpp | 80 ++--- .../Test_Batched_SerialAxpy_Complex.hpp | 4 +- .../Test_Batched_SerialAxpy_Real.hpp | 4 +- ..._Batched_SerialEigendecomposition_Real.hpp | 4 +- .../Test_Batched_SerialGemm_Complex.hpp | 32 +- .../Test_Batched_SerialGemm_Real.hpp | 76 ++-- .../Test_Batched_SerialGesv_Real.hpp | 10 +- .../Test_Batched_SerialInverseLU_Complex.hpp | 4 +- .../Test_Batched_SerialInverseLU_Real.hpp | 8 +- 
.../Test_Batched_SerialLU_Complex.hpp | 2 +- .../unit_test/Test_Batched_SerialLU_Real.hpp | 4 +- .../unit_test/Test_Batched_SerialSVD.hpp | 16 +- .../Test_Batched_SerialSolveLU_Complex.hpp | 4 +- .../Test_Batched_SerialSolveLU_Real.hpp | 8 +- .../Test_Batched_SerialTrmm_Complex.hpp | 108 +++--- .../Test_Batched_SerialTrmm_Real.hpp | 90 ++--- .../Test_Batched_SerialTrsm_Complex.hpp | 56 +-- .../Test_Batched_SerialTrsm_Real.hpp | 50 ++- .../Test_Batched_SerialTrsv_Complex.hpp | 24 +- .../Test_Batched_SerialTrsv_Real.hpp | 20 +- .../Test_Batched_SerialTrtri_Complex.hpp | 28 +- .../Test_Batched_SerialTrtri_Real.hpp | 20 +- .../Test_Batched_TeamAxpy_Complex.hpp | 4 +- .../unit_test/Test_Batched_TeamAxpy_Real.hpp | 4 +- .../Test_Batched_TeamGemm_Complex.hpp | 24 +- .../unit_test/Test_Batched_TeamGemm_Real.hpp | 48 +-- .../unit_test/Test_Batched_TeamGesv_Real.hpp | 10 +- .../Test_Batched_TeamInverseLU_Complex.hpp | 4 +- .../Test_Batched_TeamInverseLU_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamLU_Complex.hpp | 2 +- .../unit_test/Test_Batched_TeamLU_Real.hpp | 4 +- .../Test_Batched_TeamSolveLU_Complex.hpp | 4 +- .../Test_Batched_TeamSolveLU_Real.hpp | 8 +- .../Test_Batched_TeamTrsm_Complex.hpp | 40 +-- .../unit_test/Test_Batched_TeamTrsm_Real.hpp | 40 +-- .../Test_Batched_TeamTrsv_Complex.hpp | 16 +- .../unit_test/Test_Batched_TeamTrsv_Real.hpp | 16 +- .../Test_Batched_TeamVectorAxpy_Complex.hpp | 5 +- .../Test_Batched_TeamVectorAxpy_Real.hpp | 4 +- ...ched_TeamVectorEigendecomposition_Real.hpp | 4 +- .../Test_Batched_TeamVectorGemm_Complex.hpp | 32 +- .../Test_Batched_TeamVectorGemm_Real.hpp | 64 ++-- .../Test_Batched_TeamVectorGesv_Real.hpp | 8 +- .../Test_Batched_TeamVectorQR_Real.hpp | 4 +- ...d_TeamVectorQR_WithColumnPivoting_Real.hpp | 6 +- .../Test_Batched_TeamVectorSolveUTV2_Real.hpp | 4 +- .../Test_Batched_TeamVectorSolveUTV_Real.hpp | 4 +- .../Test_Batched_TeamVectorUTV_Real.hpp | 4 +- .../Test_Batched_VectorArithmatic.hpp | 38 +- 
.../unit_test/Test_Batched_VectorLogical.hpp | 16 +- .../unit_test/Test_Batched_VectorMath.hpp | 16 +- .../unit_test/Test_Batched_VectorMisc.hpp | 16 +- .../unit_test/Test_Batched_VectorRelation.hpp | 12 +- .../unit_test/Test_Batched_VectorView.hpp | 14 +- .../Test_Batched_SerialGMRES_Real.hpp | 4 +- .../Test_Batched_SerialSpmv_Real.hpp | 4 +- .../unit_test/Test_Batched_TeamCG_Real.hpp | 4 +- .../unit_test/Test_Batched_TeamGMRES_Real.hpp | 4 +- .../unit_test/Test_Batched_TeamSpmv_Real.hpp | 4 +- .../Test_Batched_TeamVectorCG_Real.hpp | 4 +- .../Test_Batched_TeamVectorGMRES_Real.hpp | 4 +- .../Test_Batched_TeamVectorSpmv_Real.hpp | 4 +- blas/unit_test/Test_Blas1_abs.hpp | 21 +- blas/unit_test/Test_Blas1_asum.hpp | 8 +- blas/unit_test/Test_Blas1_axpby.hpp | 21 +- blas/unit_test/Test_Blas1_axpy.hpp | 21 +- blas/unit_test/Test_Blas1_dot.hpp | 21 +- blas/unit_test/Test_Blas1_iamax.hpp | 16 +- blas/unit_test/Test_Blas1_mult.hpp | 20 +- blas/unit_test/Test_Blas1_nrm1.hpp | 16 +- blas/unit_test/Test_Blas1_nrm2.hpp | 16 +- blas/unit_test/Test_Blas1_nrm2_squared.hpp | 16 +- blas/unit_test/Test_Blas1_nrm2w.hpp | 16 +- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 16 +- blas/unit_test/Test_Blas1_nrminf.hpp | 16 +- blas/unit_test/Test_Blas1_reciprocal.hpp | 20 +- blas/unit_test/Test_Blas1_rot.hpp | 8 +- blas/unit_test/Test_Blas1_rotg.hpp | 8 +- blas/unit_test/Test_Blas1_rotm.hpp | 4 +- blas/unit_test/Test_Blas1_scal.hpp | 21 +- blas/unit_test/Test_Blas1_serial_setscal.hpp | 16 +- blas/unit_test/Test_Blas1_sum.hpp | 16 +- blas/unit_test/Test_Blas1_swap.hpp | 8 +- blas/unit_test/Test_Blas1_team_abs.hpp | 21 +- blas/unit_test/Test_Blas1_team_axpby.hpp | 20 +- blas/unit_test/Test_Blas1_team_axpy.hpp | 22 +- blas/unit_test/Test_Blas1_team_dot.hpp | 21 +- blas/unit_test/Test_Blas1_team_mult.hpp | 20 +- blas/unit_test/Test_Blas1_team_nrm2.hpp | 12 +- blas/unit_test/Test_Blas1_team_scal.hpp | 22 +- blas/unit_test/Test_Blas1_team_setscal.hpp | 16 +- 
blas/unit_test/Test_Blas1_team_update.hpp | 20 +- blas/unit_test/Test_Blas1_update.hpp | 20 +- blas/unit_test/Test_Blas2_gemv.hpp | 22 +- blas/unit_test/Test_Blas2_gemv_util.hpp | 7 +- blas/unit_test/Test_Blas2_ger.hpp | 14 +- blas/unit_test/Test_Blas2_syr.hpp | 12 +- blas/unit_test/Test_Blas3_gemm.hpp | 53 ++- blas/unit_test/Test_Blas3_trmm.hpp | 296 ++++++++-------- blas/unit_test/Test_Blas3_trsm.hpp | 332 +++++++++--------- blas/unit_test/Test_Blas_Newton.hpp | 4 +- blas/unit_test/Test_Blas_gesv.hpp | 34 +- blas/unit_test/Test_Blas_serial_axpy.hpp | 12 +- blas/unit_test/Test_Blas_serial_nrm2.hpp | 8 +- blas/unit_test/Test_Blas_trtri.hpp | 32 +- common/unit_test/Test_Common_ArithTraits.hpp | 2 +- common/unit_test/Test_Common_IOUtils.hpp | 2 +- common/unit_test/Test_Common_LowerBound.hpp | 10 +- .../Test_Common_PrintConfiguration.hpp | 2 +- common/unit_test/Test_Common_Sorting.hpp | 44 +-- common/unit_test/Test_Common_UpperBound.hpp | 10 +- .../unit_test/Test_Common_set_bit_count.hpp | 56 +-- graph/unit_test/Test_Graph_coarsen.hpp | 8 +- graph/unit_test/Test_Graph_graph_color.hpp | 8 +- .../Test_Graph_graph_color_deterministic.hpp | 8 +- .../Test_Graph_graph_color_distance2.hpp | 8 +- graph/unit_test/Test_Graph_mis2.hpp | 8 +- graph/unit_test/Test_Graph_rcm.hpp | 8 +- ode/unit_test/Test_ODE_Newton.hpp | 16 +- ode/unit_test/Test_ODE_RK.hpp | 6 +- ode/unit_test/Test_ODE_RK_chem.hpp | 2 +- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 42 +-- sparse/unit_test/Test_Sparse_SortCrs.hpp | 44 +-- .../Test_Sparse_TestUtils_RandCsMat.hpp | 6 +- sparse/unit_test/Test_Sparse_Transpose.hpp | 38 +- sparse/unit_test/Test_Sparse_ccs2crs.hpp | 24 +- sparse/unit_test/Test_Sparse_coo2crs.hpp | 22 +- sparse/unit_test/Test_Sparse_crs2ccs.hpp | 24 +- sparse/unit_test/Test_Sparse_crs2coo.hpp | 4 +- sparse/unit_test/Test_Sparse_csc2csr.hpp | 20 +- .../unit_test/Test_Sparse_findRelOffset.hpp | 4 +- .../Test_Sparse_removeCrsMatrixZeros.hpp | 13 +- sparse/unit_test/Test_Sparse_spmv.hpp | 20 
+- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 6 +- sparse/unit_test/Test_Sparse_trsv.hpp | 4 +- .../Test_Common_Test_All_Type_Combos.hpp | 66 ++-- test_common/Test_Cuda.hpp | 4 +- test_common/Test_HIP.hpp | 2 +- test_common/Test_OpenMP.hpp | 2 +- test_common/Test_OpenMPTarget.hpp | 2 +- test_common/Test_SYCL.hpp | 2 +- test_common/Test_Serial.hpp | 2 +- test_common/Test_Threads.hpp | 2 +- 144 files changed, 1512 insertions(+), 1599 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index a2b9edf1e6..3c58f432ec 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -20,32 +20,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, @@ -54,32 +54,32 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, 
Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } #endif @@ -90,7 +90,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { @@ -98,7 +98,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { @@ -106,7 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { @@ -114,7 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -124,7 +124,7 @@ TEST_F(TestCategory, 
BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { @@ -132,7 +132,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { @@ -140,7 +140,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { @@ -148,7 +148,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 00561e0317..62a4a291a8 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -25,7 +25,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { @@ -33,7 +33,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { @@ -41,7 +41,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { @@ -49,7 
+49,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ @@ -58,7 +58,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { @@ -66,7 +66,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { @@ -74,7 +74,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { @@ -82,7 +82,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT @@ -98,32 +98,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { @@ -131,32 +131,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { 
BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -167,28 +167,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { @@ -196,28 +196,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_batched_gemm_t_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif @@ -228,28 +228,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { ::Test::SharedParamTag; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { @@ -257,27 +257,27 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp index de677b1045..ed647f1e3b 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp @@ -16,11 
+16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_dcomplex) { - test_batched_axpy, + test_batched_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_double) { - test_batched_axpy, double>(); + test_batched_axpy, double>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp index 078e4bfa8f..3f1f6af2fd 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { - test_batched_axpy(); + test_batched_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { - test_batched_axpy(); + test_batched_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp index 2e82468f8b..ad4b790717 100644 --- a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp @@ -16,13 +16,13 @@ /* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_float ) { - test_batched_serial_eigendecomposition(); + test_batched_serial_eigendecomposition(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_double ) { - test_batched_serial_eigendecomposition(); + test_batched_serial_eigendecomposition(); } #endif */ diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp index 01988c9e51..f671292c98 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp +++ 
b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp @@ -21,39 +21,39 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_serial_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -62,39 +62,39 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - 
param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_gemm,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_serial_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_gemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp index c32556c229..6f074867d9 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp @@ -18,10 +18,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_bhalf_bhalf) { typedef 
::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -29,10 +29,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -40,10 +40,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -51,10 +51,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -65,45 +65,37 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -112,29 +104,25 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_serial_gemm_nt_t_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif @@ -143,28 +131,28 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp index 2b2493506e..00161ecb70 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp @@ -15,20 +15,18 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_float) { - test_batched_gesv(); + test_batched_gesv(); } TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_float) { - test_batched_gesv(); + test_batched_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_double) { - test_batched_gesv(); + test_batched_gesv(); } TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_double) 
{ - test_batched_gesv(); + test_batched_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp index 0be3375715..243ed21908 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_serial_inverselu_dcomplex) { // printf("Batched serial inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched serial inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp index 7eea2c9627..143db37b0c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_inverselu_float) { // printf("Batched serial inverse LU - float - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched serial inverse LU - float - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_inverselu_double) { // printf("Batched serial inverse LU - double - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched serial inverse LU - double - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp 
b/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp index 29936b7825..b07bece091 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp @@ -17,6 +17,6 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_lu_dcomplex) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu, algo_tag_type>(); + test_batched_lu, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp index a185e3b520..ace508fab2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_lu_float) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_lu_double) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index fb56e25894..099fa9219f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -456,19 +456,19 @@ void testIssue1786() { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_svd_double) { // Test general SVD on a few different input sizes (full rank randomized) - testSVD(); - testSVD(); - testIssue1786(); - testIssue1786(); + testSVD(); + testSVD(); + testIssue1786(); + testIssue1786(); } #endif #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_svd_float) { // Test general SVD on a few different input sizes (full rank randomized) - testSVD(); - testSVD(); - testIssue1786(); - testIssue1786(); + 
testSVD(); + testSVD(); + testIssue1786(); + testIssue1786(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp index b0977189a4..6eaf9ca5aa 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_serial_solvelu_dcomplex) { // printf("Batched serial solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_solvelu, + test_batched_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched serial solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_solvelu, + test_batched_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp index f586e3b62c..37d768df98 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_solvelu_float) { // printf("Batched serial solveLU - float - algorithm type: Unblocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); // printf("Batched serial solveLU - float - algorithm type: Blocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_solvelu_double) { // printf("Batched serial solveLU - double - algorithm type: Unblocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); // printf("Batched serial solveLU - double - algorithm type: Blocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp index 
5718a81694..8ab6e2810c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp @@ -22,8 +22,8 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, 
param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { @@ -132,8 +132,8 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } 
TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } #endif @@ -190,7 +190,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -200,7 +200,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -210,7 +210,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -220,7 +220,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, 
algo_tag_type>( 128); } @@ -230,7 +230,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -240,7 +240,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -251,7 +251,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -261,7 +261,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -271,7 +271,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -281,7 +281,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -291,7 +291,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -301,7 +301,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - 
test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -312,7 +312,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -322,7 +322,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -332,7 +332,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -342,7 +342,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -352,7 +352,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -362,7 +362,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp index c01ae8dbea..1cfc259dd3 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp @@ -22,8 +22,7 @@ TEST_F(TestCategory, 
batched_scalar_serial_trmm_l_l_nt_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { @@ -77,8 +71,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { @@ -132,8 +120,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - 
test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } #endif @@ -190,7 +172,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { @@ -199,7 +181,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { @@ -208,7 +190,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_double_double) { @@ -217,7 +199,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { @@ -226,7 +208,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { @@ -235,7 +217,7 @@ 
TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } // TRANSPOSE @@ -245,7 +227,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { @@ -254,7 +236,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { @@ -263,7 +245,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { @@ -272,7 +254,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { @@ -281,7 +263,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_double_double) { @@ -290,7 +272,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } // CONJUGATE TRANSPOSE @@ -300,7 +282,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { @@ -309,7 +291,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { param_tag_type; 
typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { @@ -318,7 +300,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { @@ -327,7 +309,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { @@ -336,7 +318,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { @@ -345,7 +327,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp index d7c52ccd77..be0005a74c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { @@ -28,7 +28,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); 
} TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { @@ -36,7 +36,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_dcomplex ) @@ -44,14 +44,14 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_dcomplex) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { @@ -59,7 +59,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // @@ -68,7 +68,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { @@ -76,7 +76,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, 
batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { @@ -84,7 +84,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_dcomplex ) @@ -92,7 +92,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { @@ -100,46 +100,46 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_double ) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,double,param_tag_type,algo_tag_type>(); +// 
test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { @@ -147,29 +147,29 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_double ) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,double,param_tag_type,algo_tag_type>(); +// test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git 
a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp index c308071145..18b10a81e6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp @@ -20,48 +20,42 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { @@ -69,32 +63,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - 
test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } #endif @@ -104,7 +94,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_double_double) { @@ -112,7 +102,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { @@ -120,7 +110,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { @@ -128,7 +118,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { @@ -136,7 +126,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { @@ -144,7 +134,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } // @@ 
-153,7 +143,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { @@ -161,7 +151,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { @@ -169,7 +159,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { @@ -177,7 +167,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp index 1af71e7104..a524b9f97e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp @@ -19,28 +19,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - 
test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -48,28 +48,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp index 71eb62b559..be1bf77b9e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp @@ -19,29 +19,25 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; 
typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } #endif @@ -50,28 +46,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp index db9d06ec06..0d8f2c72a6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp @@ -20,33 +20,29 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - 
test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } #endif @@ -56,7 +52,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -64,7 +60,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -72,7 +68,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, 
algo_tag_type>( 128); } @@ -80,7 +76,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp index 48617506de..952994d207 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp @@ -20,29 +20,25 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } #endif @@ -52,28 +48,28 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; 
typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp index 79c25ba9dc..b95b769fcc 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp @@ -16,11 +16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_dcomplex) { - test_batched_team_axpy, + test_batched_team_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_double) { - test_batched_team_axpy, double>(); + test_batched_team_axpy, double>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp index 967bfa8e46..ac458d4a55 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_axpy_nt_float_float) { - test_batched_team_axpy(); + test_batched_team_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_double_double) { - test_batched_team_axpy(); + test_batched_team_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp index ebe22e6e1d..09c7f3f2cc 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp @@ -22,7 +22,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag 
param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -30,7 +30,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -38,7 +38,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -46,19 +46,19 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -67,39 +67,39 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } 
TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp index f109fa4bf9..b1a5135018 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp @@ -18,10 +18,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -29,10 +29,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_bhalf_bhalf) { 
typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -40,10 +40,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -51,10 +51,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -65,10 +65,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -76,10 +76,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -87,10 +87,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -98,10 +98,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -112,28 +112,28 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked 
algo_tag_type; - test_batched_teamgemm(); } #endif @@ -143,28 +143,28 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp index 10d1be9c38..d0b04ea57c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp @@ -15,22 +15,20 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) { - test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { - test_batched_team_gesv(); + test_batched_team_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) { - test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { - test_batched_team_gesv(); + test_batched_team_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp index 97afe2c1ad..7eb918beef 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp +++ 
b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_team_inverselu_dcomplex) { // printf("Batched team inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched team inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp index 74c7efd25b..3939fdd13a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_inverselu_float) { // printf("Batched team inverse LU - float - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched team inverse LU - float - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_inverselu_double) { // printf("Batched team inverse LU - double - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched team inverse LU - double - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp index e05b521f8c..2c422397e7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp @@ -17,6 +17,6 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_lu_dcomplex) { typedef Algo::LU::Blocked 
algo_tag_type; - test_batched_lu, algo_tag_type>(); + test_batched_lu, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp index 5e7f05277e..5babaf996c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_lu_float) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_lu_double) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp index 4076fd5c31..865f58ef43 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_team_solvelu_dcomplex) { // printf("Batched team solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_team_solvelu, + test_batched_team_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched team solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_team_solvelu, + test_batched_team_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp index 4882caabe8..73c55e8a93 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_solvelu_float) { // printf("Batched team solveLU - float - algorithm type: Unblocked\n"); - 
test_batched_team_solvelu(); + test_batched_team_solvelu(); // printf("Batched team solveLU - float - algorithm type: Blocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_solvelu_double) { // printf("Batched team solveLU - double - algorithm type: Unblocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); // printf("Batched team solveLU - double - algorithm type: Blocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp index 7648017287..0cf2761922 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -29,7 +29,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -38,7 +38,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -47,7 +47,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, 
Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -56,7 +56,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -65,7 +65,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -75,7 +75,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -84,7 +84,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -93,7 +93,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -102,7 +102,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -112,7 +112,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, 
algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { @@ -120,7 +120,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { @@ -128,7 +128,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { @@ -136,7 +136,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { @@ -144,7 +144,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { @@ -152,7 +152,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } // @@ -161,7 +161,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked 
algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { @@ -169,7 +169,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { @@ -177,7 +177,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { @@ -185,7 +185,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp index d705b42a50..6757617ddd 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { @@ -28,7 +28,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { 
@@ -36,7 +36,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { @@ -44,7 +44,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { @@ -52,7 +52,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { @@ -60,7 +60,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } // @@ -69,7 +69,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { @@ -77,7 +77,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { @@ -85,7 +85,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { @@ -93,7 +93,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - 
test_batched_team_trsm(); } #endif @@ -104,7 +104,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_double_double) { @@ -112,7 +112,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { @@ -120,7 +120,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { @@ -128,7 +128,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { @@ -136,7 +136,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { @@ -144,7 +144,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } // @@ -153,7 +153,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double) { @@ -161,7 +161,7 @@ TEST_F(TestCategory, 
batched_scalar_team_trsm_l_l_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { @@ -169,7 +169,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { @@ -177,7 +177,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp index d01e4b7f94..304e929462 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp @@ -19,49 +19,49 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, 
batched_scalar_team_trsv_u_nt_n_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_u_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp index d270a5f4f9..532ed87f4f 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp @@ -19,25 +19,25 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked 
algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } #endif @@ -46,24 +46,24 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp index 161db2d3f5..b1f70a723e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp +++ 
b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp @@ -16,12 +16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_dcomplex) { - test_batched_teamvector_axpy, + test_batched_teamvector_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_double) { - test_batched_teamvector_axpy, - double>(); + test_batched_teamvector_axpy, double>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp index f81f17046f..15570bc094 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_float_float) { - test_batched_teamvector_axpy(); + test_batched_teamvector_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_double_double) { - test_batched_teamvector_axpy(); + test_batched_teamvector_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp index 98ae616bbc..0a71de6bb7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp @@ -16,13 +16,13 @@ /* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_float ) { - test_batched_teamvector_eigendecomposition(); + test_batched_teamvector_eigendecomposition(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_double ) { - test_batched_teamvector_eigendecomposition(); + test_batched_teamvector_eigendecomposition(); } #endif */ 
diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp index 9250896194..cc6cbdd511 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp @@ -19,8 +19,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -28,8 +28,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -37,8 +37,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -46,8 +46,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // 
test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -59,8 +59,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -68,8 +68,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -86,8 +86,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } diff --git 
a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp index b8ad094f8e..e96bc1ac5c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp @@ -19,8 +19,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_bhalf_bhalf) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -28,8 +28,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -37,8 +37,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -46,8 +46,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -59,8 +59,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -68,8 +68,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + 
test_batched_teamvectorgemm(); } @@ -86,8 +86,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -99,32 +99,32 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif @@ -135,32 +135,32 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, 
batched_scalar_team_vector_gemm_t_t_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp index 24d71d2b89..66c6fb3691 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp @@ -15,22 +15,22 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) { - test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { - test_batched_teamvector_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { - test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { - test_batched_teamvector_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp index 54c0388d17..d79d868bc1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_qr_float) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr(); + test_batched_qr(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_qr_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_qr_double) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr(); + test_batched_qr(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp 
b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp index 81e010a895..35713ac7f1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp @@ -17,8 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_float) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr_with_columnpivoting(); + test_batched_qr_with_columnpivoting(); } #endif @@ -27,8 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_double) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr_with_columnpivoting(); + test_batched_qr_with_columnpivoting(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp index 52b8f263c3..c8e547d1d0 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv2(); + test_batched_solve_utv2(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv2(); + test_batched_solve_utv2(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp index b2bc52dafb..a3b5bcec29 100644 --- 
a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv(); + test_batched_solve_utv(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv(); + test_batched_solve_utv(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp index 980f0ebf75..7e9a8feafe 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_utv_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_utv(); + test_batched_utv(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_utv_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_utv_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_utv(); + test_batched_utv(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 1006325f94..9d1205717f 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -265,96 +265,96 @@ int test_batched_complex_real_imag_value() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_float3) { - test_batched_vector_arithmatic, 3>(); + test_batched_vector_arithmatic, 3>(); } 
TEST_F(TestCategory, batched_vector_arithmatic_simd_float4) { - test_batched_vector_arithmatic, 4>(); + test_batched_vector_arithmatic, 4>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_float8) { - test_batched_vector_arithmatic, 8>(); + test_batched_vector_arithmatic, 8>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_float16) { - test_batched_vector_arithmatic, 16>(); + test_batched_vector_arithmatic, 16>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_arithmatic_simd_double3) { - test_batched_vector_arithmatic, 3>(); + test_batched_vector_arithmatic, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_double4) { - test_batched_vector_arithmatic, 4>(); + test_batched_vector_arithmatic, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_double8) { - test_batched_vector_arithmatic, 8>(); + test_batched_vector_arithmatic, 8>(); } #endif #define __DO_NOT_TEST__ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex3) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex4) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex8) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 8>(); } TEST_F(TestCategory, batched_vector_scomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_scomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_scomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex3) { - test_batched_vector_arithmatic >, 
+ test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex2) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex4) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 4>(); } TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index 9393afd77b..5ab10bb5bd 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -114,37 +114,37 @@ int test_batched_vector_logical() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_logical_simd_float3) { - test_batched_vector_logical(); + test_batched_vector_logical(); } TEST_F(TestCategory, batched_vector_logical_simd_float8) { - test_batched_vector_logical(); + test_batched_vector_logical(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_logical_simd_double3) { - test_batched_vector_logical(); + test_batched_vector_logical(); } TEST_F(TestCategory, batched_vector_logical_simd_double4) { - test_batched_vector_logical(); + test_batched_vector_logical(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_logical_simd_scomplex3 ) { -// test_batched_vector_logical,3>(); +// test_batched_vector_logical,3>(); // } // TEST_F( TestCategory, batched_vector_logical_simd_scomplex4 ) { -// test_batched_vector_logical,4>(); +// 
test_batched_vector_logical,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_logical_simd_dcomplex3 ) { -// test_batched_vector_logical,3>(); +// test_batched_vector_logical,3>(); // } // TEST_F( TestCategory, batched_vector_logical_simd_dcomplex2 ) { -// test_batched_vector_logical,2>(); +// test_batched_vector_logical,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index d2aa9eb7bc..02c943d587 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -157,19 +157,19 @@ int test_batched_vector_math() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_math_simd_float3) { - test_batched_vector_math, 3>(); + test_batched_vector_math, 3>(); } TEST_F(TestCategory, batched_vector_math_simd_float8) { - test_batched_vector_math, 8>(); + test_batched_vector_math, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_math_simd_double3) { - test_batched_vector_math, 3>(); + test_batched_vector_math, 3>(); } TEST_F(TestCategory, batched_vector_math_simd_double4) { - test_batched_vector_math, 4>(); + test_batched_vector_math, 4>(); } #endif @@ -178,20 +178,20 @@ TEST_F(TestCategory, batched_vector_math_simd_double4) { // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_math_simd_scomplex3 ) { // test_complex_pow(); -// test_batched_vector_math >,3>(); +// test_batched_vector_math >,3>(); // } // TEST_F( TestCategory, batched_vector_math_simd_scomplex4 ) { -// test_batched_vector_math >,4>(); +// test_batched_vector_math >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_math_simd_dcomplex3 ) { // test_complex_pow(); -// test_batched_vector_math >,3>(); +// test_batched_vector_math >,3>(); // } // 
TEST_F( TestCategory, batched_vector_math_simd_dcomplex2 ) { -// test_batched_vector_math >,2>(); +// test_batched_vector_math >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index 70d0e10cd2..5f176ccba8 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -173,37 +173,37 @@ int test_batched_vector_misc() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_misc_simd_float3) { - test_batched_vector_misc, 3>(); + test_batched_vector_misc, 3>(); } TEST_F(TestCategory, batched_vector_misc_simd_float8) { - test_batched_vector_misc, 8>(); + test_batched_vector_misc, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_misc_simd_double3) { - test_batched_vector_misc, 3>(); + test_batched_vector_misc, 3>(); } TEST_F(TestCategory, batched_vector_misc_simd_double4) { - test_batched_vector_misc, 4>(); + test_batched_vector_misc, 4>(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_misc_simd_scomplex3 ) { -// test_batched_vector_misc >,3>(); +// test_batched_vector_misc >,3>(); // } // TEST_F( TestCategory, batched_vector_misc_simd_scomplex4 ) { -// test_batched_vector_misc >,4>(); +// test_batched_vector_misc >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_misc_simd_dcomplex3 ) { -// test_batched_vector_misc >,3>(); +// test_batched_vector_misc >,3>(); // } // TEST_F( TestCategory, batched_vector_misc_simd_dcomplex2 ) { -// test_batched_vector_misc >,2>(); +// test_batched_vector_misc >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index 54eb2938e5..1aff1b2d0f 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ 
b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -127,19 +127,19 @@ int test_batched_vector_relation() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_relation_simd_float3) { - test_batched_vector_relation, 3>(); + test_batched_vector_relation, 3>(); } TEST_F(TestCategory, batched_vector_relation_simd_float8) { - test_batched_vector_relation, 8>(); + test_batched_vector_relation, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_relation_simd_double3) { - test_batched_vector_relation, 3>(); + test_batched_vector_relation, 3>(); } TEST_F(TestCategory, batched_vector_relation_simd_double4) { - test_batched_vector_relation, 4>(); + test_batched_vector_relation, 4>(); } #endif @@ -147,14 +147,14 @@ TEST_F(TestCategory, batched_vector_relation_simd_double4) { // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_relation_simd_scomplex4 ) { -// test_batched_vector_relation +// test_batched_vector_relation // >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_relation_simd_dcomplex2 ) { -// test_batched_vector_relation +// test_batched_vector_relation // >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index 793c4ac3f3..74c7748cba 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -356,31 +356,31 @@ int test_batched_vector_view() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_view_simd_float8) { - test_batched_vector_view, 8>(); + test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_view_simd_double4) { - test_batched_vector_view, 4>(); + test_batched_vector_view, 4>(); } TEST_F(TestCategory, batched_vector_view_simd_double8) { - 
test_batched_vector_view, 8>(); + test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_view_simd_scomplex4) { - test_batched_vector_view >, 4>(); + test_batched_vector_view >, 4>(); } TEST_F(TestCategory, batched_vector_view_simd_scomplex8) { - test_batched_vector_view >, 8>(); + test_batched_vector_view >, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { - test_batched_vector_view >, 2>(); + test_batched_vector_view >, 2>(); } #if defined(KOKKOS_COMPILER_INTEL) && \ @@ -392,7 +392,7 @@ TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { } #else TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { - test_batched_vector_view >, 4>(); + test_batched_vector_view >, 4>(); } #endif // KOKKOS_COMPILER_INTEL #endif // KOKKOSKERNELS_INST_COMPLEX_DOUBLE diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp index 6201e29ebc..ccfe3c37d5 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { - test_batched_serial_GMRES(); + test_batched_serial_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { - test_batched_serial_GMRES(); + test_batched_serial_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp index bba455fef7..06c8c2695d 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_spmv_nt_float_float) { typedef 
::Test::Spmv::ParamTag param_tag_type; - test_batched_spmv(); + test_batched_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_spmv(); + test_batched_spmv(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp index beb2a078e7..1bdb6bc95a 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_CG_float) { - test_batched_team_CG(); + test_batched_team_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_CG_double) { - test_batched_team_CG(); + test_batched_team_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp index f40452b952..f8aab13eec 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_GMRES_float) { - test_batched_team_GMRES(); + test_batched_team_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_GMRES_double) { - test_batched_team_GMRES(); + test_batched_team_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp index de3f6168a9..d815ee7b12 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_spmv_nt_float_float) { typedef ::Test::Spmv::ParamTag param_tag_type; 
- test_batched_team_spmv(); + test_batched_team_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_team_spmv(); + test_batched_team_spmv(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp index e3b34ca594..859a1a885c 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { - test_batched_teamvector_CG(); + test_batched_teamvector_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { - test_batched_teamvector_CG(); + test_batched_teamvector_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp index e36ee2b67c..53b740deaa 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { - test_batched_teamvector_GMRES(); + test_batched_teamvector_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { - test_batched_teamvector_GMRES(); + test_batched_teamvector_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp index 709dea5df1..05d6dcd316 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp @@ -17,13 +17,13 @@ #if 
defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_spmv_nt_float_float) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_teamvector_spmv(); + test_batched_teamvector_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_teamvector_spmv(); + test_batched_teamvector_spmv(); } #endif diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index 8a2c7e3374..5bf3f55388 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -213,12 +213,12 @@ int test_abs_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_float"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_float"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -228,12 +228,12 @@ TEST_F(TestCategory, abs_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -243,13 +243,12 @@ TEST_F(TestCategory, abs_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); - test_abs, Kokkos::complex, TestExecSpace>(); + test_abs, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); - test_abs_mv, 
Kokkos::complex, - TestExecSpace>(); + test_abs_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -259,21 +258,21 @@ TEST_F(TestCategory, abs_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_int"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_int"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -abs_double_int ) { test_abs (); +abs_double_int ) { test_abs (); } TEST_F( TestCategory, abs_double_mv_int ) { - test_abs_mv (); + test_abs_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index e914c9a19a..65b5b2c063 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -98,7 +98,7 @@ int test_asum() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_float"); - test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif @@ -108,7 +108,7 @@ TEST_F(TestCategory, asum_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_double"); - test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif @@ -118,7 +118,7 @@ TEST_F(TestCategory, asum_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_complex_double"); - test_asum, TestExecSpace>(); + test_asum, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -128,7 +128,7 @@ TEST_F(TestCategory, asum_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
TEST_F(TestCategory, asum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_int"); - test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 0d34464a84..8d5afb5f0b 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -227,12 +227,12 @@ int test_axpby_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_float"); - test_axpby(); + test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_float"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -242,11 +242,11 @@ TEST_F(TestCategory, axpby_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double"); - test_axpby(); + test_axpby(); } TEST_F(TestCategory, axpby_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -256,13 +256,12 @@ TEST_F(TestCategory, axpby_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_complex_double"); - test_axpby, Kokkos::complex, TestExecSpace>(); + test_axpby, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_complex_double"); - test_axpby_mv, Kokkos::complex, - TestExecSpace>(); + test_axpby_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -272,12 +271,12 @@ TEST_F(TestCategory, axpby_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
TEST_F(TestCategory, axpby_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_int"); - test_axpby(); + test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_int"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -286,12 +285,12 @@ TEST_F(TestCategory, axpby_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double_int"); - test_axpby(); + test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_double_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double_int"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 8b21ff6dc5..76528f4a52 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -224,12 +224,12 @@ int test_axpy_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_float"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_float"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -239,12 +239,12 @@ TEST_F(TestCategory, axpy_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -254,13 +254,12 @@ TEST_F(TestCategory, axpy_mv_double) { 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_complex_double"); - test_axpy, Kokkos::complex, TestExecSpace>(); + test_axpy, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_complex_double"); - test_axpy_mv, Kokkos::complex, - TestExecSpace>(); + test_axpy_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -270,12 +269,12 @@ TEST_F(TestCategory, axpy_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_int"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_int"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -284,12 +283,12 @@ TEST_F(TestCategory, axpy_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpy_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double_int"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_double_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double_int"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index d978cbafaa..911925476a 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -235,12 +235,12 @@ int test_dot_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_float"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_float) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_float"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -250,12 +250,12 @@ TEST_F(TestCategory, dot_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_double"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_double"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -265,13 +265,12 @@ TEST_F(TestCategory, dot_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_complex_double"); - test_dot, Kokkos::complex, TestExecSpace>(); + test_dot, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_complex_double"); - test_dot_mv, Kokkos::complex, - TestExecSpace>(); + test_dot_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -281,21 +280,21 @@ TEST_F(TestCategory, dot_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_int"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_int"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -dot_double_int ) { test_dot (); +dot_double_int ) { test_dot (); } TEST_F( TestCategory, dot_mv_double_int ) { - test_dot_mv (); + test_dot_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index 
fcd896e22a..49f759958a 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -280,12 +280,12 @@ int test_iamax_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_float"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mvfloat"); - test_iamax_mv(); + test_iamax_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -295,12 +295,12 @@ TEST_F(TestCategory, iamax_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_double"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_double"); - test_iamax_mv(); + test_iamax_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -310,12 +310,12 @@ TEST_F(TestCategory, iamax_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_complex_double"); - test_iamax, TestExecSpace>(); + test_iamax, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_complex_double"); - test_iamax_mv, TestExecSpace>(); + test_iamax_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -325,12 +325,12 @@ TEST_F(TestCategory, iamax_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_int"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_int"); - test_iamax_mv(); + test_iamax_mv(); 
Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 0888c7a6b2..6555280f0d 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -275,12 +275,12 @@ int test_mult_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -290,12 +290,12 @@ TEST_F(TestCategory, mult_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -306,13 +306,13 @@ TEST_F(TestCategory, mult_mv_double) { TEST_F(TestCategory, mult_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_complex_double"); test_mult, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_complex_double"); test_mult_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -322,12 +322,12 @@ TEST_F(TestCategory, mult_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_int"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, 
mult_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_int"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -336,12 +336,12 @@ TEST_F(TestCategory, mult_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, mult_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double_int"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double_int"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index 5c99895a49..f6938c5147 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -182,12 +182,12 @@ int test_nrm1_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_float"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_float"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, nrm1_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_double"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_double"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -212,12 +212,12 @@ TEST_F(TestCategory, nrm1_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_complex_double"); - test_nrm1, TestExecSpace>(); + test_nrm1, TestDevice>(); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_complex_double"); - test_nrm1_mv, TestExecSpace>(); + test_nrm1_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -227,12 +227,12 @@ TEST_F(TestCategory, nrm1_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_int"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_int"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index 1264cfecf2..a9b3f7c10f 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -177,12 +177,12 @@ int test_nrm2_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_float"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_float"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -192,12 +192,12 @@ TEST_F(TestCategory, nrm2_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_double"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_double"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -207,12 +207,12 @@ TEST_F(TestCategory, nrm2_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_complex_double) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_complex_double"); - test_nrm2, TestExecSpace>(); + test_nrm2, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_complex_double"); - test_nrm2_mv, TestExecSpace>(); + test_nrm2_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -222,12 +222,12 @@ TEST_F(TestCategory, nrm2_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_int"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_int"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index c218a12d39..09e4b3d45d 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -182,12 +182,12 @@ int test_nrm2_squared_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_float"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_float"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, nrm2_squared_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_double"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_double) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_double"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -213,13 +213,13 @@ TEST_F(TestCategory, nrm2_squared_mv_double) { TEST_F(TestCategory, nrm2_squared_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2_squared_complex_double"); - test_nrm2_squared, TestExecSpace>(); + test_nrm2_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2_squared_mv_complex_double"); - test_nrm2_squared_mv, TestExecSpace>(); + test_nrm2_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -229,12 +229,12 @@ TEST_F(TestCategory, nrm2_squared_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_int"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_int"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index 89c1bdad45..48d8676fe4 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -185,12 +185,12 @@ int test_nrm2w_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); - test_nrm2w(); + test_nrm2w(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_float"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -200,12 +200,12 @@ TEST_F(TestCategory, nrm2w_mv_float) { 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); - test_nrm2w(); + test_nrm2w(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_double"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -215,12 +215,12 @@ TEST_F(TestCategory, nrm2w_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); - test_nrm2w, TestExecSpace>(); + test_nrm2w, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_complex_double"); - test_nrm2w_mv, TestExecSpace>(); + test_nrm2w_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -230,12 +230,12 @@ TEST_F(TestCategory, nrm2w_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); - test_nrm2w(); + test_nrm2w(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_int"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index bacc733b1a..5a55d15fad 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -179,12 +179,12 @@ int test_nrm2w_squared_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_float) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_float"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -194,12 +194,12 @@ TEST_F(TestCategory, nrm2w_squared_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_double"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -210,13 +210,13 @@ TEST_F(TestCategory, nrm2w_squared_mv_double) { TEST_F(TestCategory, nrm2w_squared_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2w_squared_complex_double"); - test_nrm2w_squared, TestExecSpace>(); + test_nrm2w_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); - test_nrm2w_squared_mv, TestExecSpace>(); + test_nrm2w_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -226,12 +226,12 @@ TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_int"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 438db16895..91cc1c7502 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ 
b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -173,12 +173,12 @@ int test_nrminf_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_float"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mvfloat"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -188,12 +188,12 @@ TEST_F(TestCategory, nrminf_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_double"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_double"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -203,12 +203,12 @@ TEST_F(TestCategory, nrminf_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_complex_double"); - test_nrminf, TestExecSpace>(); + test_nrminf, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_complex_double"); - test_nrminf_mv, TestExecSpace>(); + test_nrminf_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -218,12 +218,12 @@ TEST_F(TestCategory, nrminf_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_int"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_int"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git 
a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 841725e6fd..c293fa04eb 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -210,12 +210,12 @@ int test_reciprocal_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_float"); - test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_float"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -225,12 +225,12 @@ TEST_F(TestCategory, reciprocal_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_double"); - test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_double"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -241,14 +241,14 @@ TEST_F(TestCategory, reciprocal_mv_double) { TEST_F(TestCategory, reciprocal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_complex_double"); test_reciprocal, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::reciprocal_mv_complex_double"); test_reciprocal_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -258,12 +258,12 @@ TEST_F(TestCategory, reciprocal_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_int"); - 
test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_int"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -271,10 +271,10 @@ TEST_F(TestCategory, reciprocal_mv_int) { /* #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -reciprocal_double_int ) { test_reciprocal (); +reciprocal_double_int ) { test_reciprocal (); } TEST_F( TestCategory, reciprocal_double_mv_int ) { - test_reciprocal_mv (); + test_reciprocal_mv (); } #endif */ diff --git a/blas/unit_test/Test_Blas1_rot.hpp b/blas/unit_test/Test_Blas1_rot.hpp index 7fe079d1aa..ab1f395923 100644 --- a/blas/unit_test/Test_Blas1_rot.hpp +++ b/blas/unit_test/Test_Blas1_rot.hpp @@ -75,7 +75,7 @@ int test_rot() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot(); + test_rot(); Kokkos::Profiling::popRegion(); } #endif @@ -85,7 +85,7 @@ TEST_F(TestCategory, rot_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot(); + test_rot(); Kokkos::Profiling::popRegion(); } #endif @@ -95,7 +95,7 @@ TEST_F(TestCategory, rot_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot, TestExecSpace>(); + test_rot, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -105,7 +105,7 @@ TEST_F(TestCategory, rot_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot, TestExecSpace>(); + test_rot, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git 
a/blas/unit_test/Test_Blas1_rotg.hpp b/blas/unit_test/Test_Blas1_rotg.hpp index 8efcb50182..27f9c3cf71 100644 --- a/blas/unit_test/Test_Blas1_rotg.hpp +++ b/blas/unit_test/Test_Blas1_rotg.hpp @@ -63,7 +63,7 @@ int test_rotg() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg(); + test_rotg(); Kokkos::Profiling::popRegion(); } #endif @@ -73,7 +73,7 @@ TEST_F(TestCategory, rotg_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg(); + test_rotg(); Kokkos::Profiling::popRegion(); } #endif @@ -83,7 +83,7 @@ TEST_F(TestCategory, rotg_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg, TestExecSpace>(); + test_rotg, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -93,7 +93,7 @@ TEST_F(TestCategory, rotg_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg, TestExecSpace>(); + test_rotg, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_rotm.hpp b/blas/unit_test/Test_Blas1_rotm.hpp index c9a09fd915..1f41fd06bc 100644 --- a/blas/unit_test/Test_Blas1_rotm.hpp +++ b/blas/unit_test/Test_Blas1_rotm.hpp @@ -166,7 +166,7 @@ int test_rotm() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); - test_rotm(); + test_rotm(); Kokkos::Profiling::popRegion(); } #endif @@ -176,7 +176,7 @@ TEST_F(TestCategory, rotm_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); - test_rotm(); + test_rotm(); 
Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 6c4f7b7f2a..a88ed646f1 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -229,12 +229,12 @@ int test_scal_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_float"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_float"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -244,12 +244,12 @@ TEST_F(TestCategory, scal_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -259,13 +259,12 @@ TEST_F(TestCategory, scal_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_complex_double"); - test_scal, Kokkos::complex, TestExecSpace>(); + test_scal, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_complex_double"); - test_scal_mv, Kokkos::complex, - TestExecSpace>(); + test_scal_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -275,12 +274,12 @@ TEST_F(TestCategory, scal_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_int"); - test_scal(); + test_scal(); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_int"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -289,12 +288,12 @@ TEST_F(TestCategory, scal_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, scal_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double_int"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double_int"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index 6c55ef65e3..cfbe4d602d 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -181,19 +181,19 @@ int test_blas_matutil() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, blas_scalar_serial_set_float_float) { - test_blas_matutil(); + test_blas_matutil(); } TEST_F(TestCategory, blas_scalar_serial_scale_float_float) { - test_blas_matutil(); + test_blas_matutil(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_double_double) { - test_blas_matutil(); + test_blas_matutil(); } TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { - test_blas_matutil(); + test_blas_matutil(); } #endif @@ -201,19 +201,19 @@ TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { - test_blas_matutil, + test_blas_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { - test_blas_matutil, + test_blas_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { - 
test_blas_matutil, double, + test_blas_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { - test_blas_matutil, double, + test_blas_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index cf119cbd00..34d52a7e4a 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -167,12 +167,12 @@ int test_sum_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_float"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_float"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -182,12 +182,12 @@ TEST_F(TestCategory, sum_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_double"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_double"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, sum_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_complex_double"); - test_sum, TestExecSpace>(); + test_sum, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_complex_double"); - test_sum_mv, TestExecSpace>(); + test_sum_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -212,12 +212,12 @@ TEST_F(TestCategory, sum_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_int) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_int"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_int"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index a7e4fff433..382c35947b 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -60,7 +60,7 @@ int test_swap() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_float"); - test_swap(); + test_swap(); Kokkos::Profiling::popRegion(); } #endif @@ -70,7 +70,7 @@ TEST_F(TestCategory, swap_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_double"); - test_swap(); + test_swap(); Kokkos::Profiling::popRegion(); } #endif @@ -80,7 +80,7 @@ TEST_F(TestCategory, swap_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_float"); - test_swap, TestExecSpace>(); + test_swap, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -90,7 +90,7 @@ TEST_F(TestCategory, swap_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_double"); - test_swap, TestExecSpace>(); + test_swap, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index 642be144e4..eca7657b55 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -289,10 +289,10 @@ int test_team_abs_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_float) { - test_team_abs(); + test_team_abs(); } TEST_F(TestCategory, team_abs_mv_float) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif @@ -300,10 +300,10 @@ TEST_F(TestCategory, team_abs_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_double) { - test_team_abs(); + test_team_abs(); } TEST_F(TestCategory, team_abs_mv_double) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif @@ -311,30 +311,29 @@ TEST_F(TestCategory, team_abs_mv_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_complex_double) { - test_team_abs, Kokkos::complex, - TestExecSpace>(); + test_team_abs, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_abs_mv_complex_double) { test_team_abs_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_int) { test_team_abs(); } +TEST_F(TestCategory, team_abs_int) { test_team_abs(); } TEST_F(TestCategory, team_abs_mv_int) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -team_abs_double_int ) { test_team_abs (); +team_abs_double_int ) { test_team_abs (); } TEST_F( TestCategory, team_abs_double_mv_int ) { - test_team_abs_mv (); + test_team_abs_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index 7052371bdb..5875f2bc1f 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -291,10 +291,10 @@ int test_team_axpby_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, 
team_axpby_float) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_float) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif @@ -302,10 +302,10 @@ TEST_F(TestCategory, team_axpby_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_double) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_double) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif @@ -314,11 +314,11 @@ TEST_F(TestCategory, team_axpby_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_complex_double) { test_team_axpby, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_axpby_mv_complex_double) { test_team_axpby_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif @@ -326,20 +326,20 @@ TEST_F(TestCategory, team_axpby_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_int) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_int) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_axpby_double_int) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_double_mv_int) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index 87a4a80fba..a5ac6a9c66 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -285,10 +285,10 @@ int test_team_axpy_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_float) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_float) { - 
test_team_axpy_mv(); + test_team_axpy_mv(); } #endif @@ -296,10 +296,10 @@ TEST_F(TestCategory, team_axpy_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_double) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_double) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif @@ -308,32 +308,30 @@ TEST_F(TestCategory, team_axpy_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_complex_double) { test_team_axpy, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_axpy_mv_complex_double) { test_team_axpy_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_int) { - test_team_axpy(); -} +TEST_F(TestCategory, team_axpy_int) { test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_int) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_axpy_double_int) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_double_mv_int) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index ec8dad838a..26baf261fe 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -355,10 +355,10 @@ int test_team_dot_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_float) { - test_team_dot(); + test_team_dot(); } TEST_F(TestCategory, team_dot_mv_float) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif @@ -366,10 +366,10 @@ TEST_F(TestCategory, team_dot_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_double) { - test_team_dot(); + test_team_dot(); } TEST_F(TestCategory, team_dot_mv_double) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif @@ -377,30 +377,29 @@ TEST_F(TestCategory, team_dot_mv_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_complex_double) { - test_team_dot, Kokkos::complex, - TestExecSpace>(); + test_team_dot, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_dot_mv_complex_double) { test_team_dot_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_int) { test_team_dot(); } +TEST_F(TestCategory, team_dot_int) { test_team_dot(); } TEST_F(TestCategory, team_dot_mv_int) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -team_dot_double_int ) { test_team_dot (); +team_dot_double_int ) { test_team_dot (); } TEST_F( TestCategory, team_dot_mv_double_int ) { - test_team_dot_mv (); + test_team_dot_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index e8802a84a6..488e9ccf51 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -366,10 +366,10 @@ int test_team_mult_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_float) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_float) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif @@ -377,10 +377,10 @@ TEST_F(TestCategory, team_mult_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, 
team_mult_double) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_double) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif @@ -389,11 +389,11 @@ TEST_F(TestCategory, team_mult_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_complex_double) { test_team_mult, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_mult_mv_complex_double) { test_team_mult_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } #endif @@ -401,20 +401,20 @@ TEST_F(TestCategory, team_mult_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_int) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_int) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_mult_double_int) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_double_mv_int) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index e5008441a4..12192032c9 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -130,31 +130,27 @@ int test_team_nrm2() { #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_float) { - test_team_nrm2(); -} +TEST_F(TestCategory, team_nrm2_float) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_double) { - test_team_nrm2(); -} +TEST_F(TestCategory, team_nrm2_double) { test_team_nrm2(); } #endif 
#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_complex_double) { - test_team_nrm2, TestExecSpace>(); + test_team_nrm2, TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } +TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index aad2e8723f..212b1e09e9 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -358,10 +358,10 @@ int test_team_scal_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_float) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_mv_float) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif @@ -369,10 +369,10 @@ TEST_F(TestCategory, team_scal_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_double) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_mv_double) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif @@ -381,32 +381,30 @@ TEST_F(TestCategory, team_scal_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_complex_double) { test_team_scal, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_scal_mv_complex_double) { test_team_scal_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_int) { - test_team_scal(); -} 
+TEST_F(TestCategory, team_scal_int) { test_team_scal(); } TEST_F(TestCategory, team_scal_mv_int) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_scal_double_int) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_double_mv_int) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index cae6e68be0..4d2499a466 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -194,19 +194,19 @@ int test_blas_team_matutil() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, blas_scalar_team_set_float_float) { - test_blas_team_matutil(); + test_blas_team_matutil(); } TEST_F(TestCategory, blas_scalar_team_scale_float_float) { - test_blas_team_matutil(); + test_blas_team_matutil(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_double_double) { - test_blas_team_matutil(); + test_blas_team_matutil(); } TEST_F(TestCategory, blas_scalar_team_scale_double_double) { - test_blas_team_matutil(); + test_blas_team_matutil(); } #endif @@ -214,19 +214,19 @@ TEST_F(TestCategory, blas_scalar_team_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { - test_blas_team_matutil, + test_blas_team_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { - test_blas_team_matutil, + test_blas_team_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) { - test_blas_team_matutil, double, + test_blas_team_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { - test_blas_team_matutil, double, + test_blas_team_matutil, 
double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index 35182b27fd..cfc76455f3 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -370,10 +370,10 @@ int test_team_update_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_float) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_float) { - test_team_update_mv(); + test_team_update_mv(); } #endif @@ -381,10 +381,10 @@ TEST_F(TestCategory, team_update_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_double) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_double) { - test_team_update_mv(); + test_team_update_mv(); } #endif @@ -393,11 +393,11 @@ TEST_F(TestCategory, team_update_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_complex_double) { test_team_update, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_update_mv_complex_double) { test_team_update_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } #endif @@ -405,20 +405,20 @@ TEST_F(TestCategory, team_update_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_int) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_int) { - test_team_update_mv(); + test_team_update_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_update_double_int) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_double_mv_int) { - test_team_update_mv(); + 
test_team_update_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index 07445f595e..cfeddb9d3d 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -279,12 +279,12 @@ int test_update_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_float"); - test_update(); + test_update(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_float"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -294,11 +294,11 @@ TEST_F(TestCategory, update_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double"); - test_update(); + test_update(); } TEST_F(TestCategory, update_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -309,13 +309,13 @@ TEST_F(TestCategory, update_mv_double) { TEST_F(TestCategory, update_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_complex_double"); test_update, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_complex_double"); test_update_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -325,12 +325,12 @@ TEST_F(TestCategory, update_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_int"); - test_update(); + test_update(); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_int"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -339,12 +339,12 @@ TEST_F(TestCategory, update_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, update_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double_int"); - test_update(); + test_update(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double_int"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index dc83ac82f5..518e7b8055 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -240,11 +240,11 @@ int test_gemv(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_float"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_float"); - test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -254,11 +254,11 @@ TEST_F(TestCategory, gemv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_double"); - test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -269,17 +269,17 @@ TEST_F(TestCategory, gemv_double) { TEST_F(TestCategory, gemv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("N"); + Kokkos::complex, 
TestDevice>("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("T"); + Kokkos::complex, TestDevice>("T"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("C"); + Kokkos::complex, TestDevice>("C"); Kokkos::Profiling::popRegion(); } #endif @@ -289,11 +289,11 @@ TEST_F(TestCategory, gemv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_int"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_int"); - test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -302,11 +302,11 @@ TEST_F(TestCategory, gemv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, gemv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double_int"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); // Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemvt_double_int"); - // test_gemv ("T"); + // test_gemv ("T"); // Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index 80bf76b0dd..e28310c8eb 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -23,8 +23,9 @@ namespace Test { -template ::value> +template ::value> using simd_vector = KokkosBatched::Vector, length>; @@ -320,7 +321,7 @@ struct GEMVTest { SCALAR_COEF) \ using PREFIX##_##NAME##_gemv_test = \ ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, \ - TestExecSpace, SCALAR_COEF>; \ + TestDevice, SCALAR_COEF>; \ TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { \ PREFIX##_##NAME##_gemv_test::run("N"); \ 
} \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 6bf44f98f8..abafd79ac9 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -1613,7 +1613,7 @@ int test_ger(const std::string& /*caseName*/) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); - test_ger("test case ger_float"); + test_ger("test case ger_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1624,8 +1624,7 @@ TEST_F(TestCategory, ger_float) { TEST_F(TestCategory, ger_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); test_ger, Kokkos::complex, - Kokkos::complex, TestExecSpace>( - "test case ger_complex_float"); + Kokkos::complex, TestDevice>("test case ger_complex_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1635,7 +1634,7 @@ TEST_F(TestCategory, ger_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); - test_ger("test case ger_double"); + test_ger("test case ger_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1646,8 +1645,7 @@ TEST_F(TestCategory, ger_double) { TEST_F(TestCategory, ger_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); test_ger, Kokkos::complex, - Kokkos::complex, TestExecSpace>( - "test case ger_complex_double"); + Kokkos::complex, TestDevice>("test case ger_complex_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1657,7 +1655,7 @@ TEST_F(TestCategory, ger_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); - test_ger("test case ger_int"); + test_ger("test case ger_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1666,7 +1664,7 @@ TEST_F(TestCategory, ger_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) 
TEST_F(TestCategory, ger_double_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); - test_ger("test case ger_double_int_float"); + test_ger("test case ger_double_int_float"); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index e7b5e7de3d..b57061ce8f 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -1777,7 +1777,7 @@ int test_syr(const std::string& /*caseName*/) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); - test_syr("test case syr_float"); + test_syr("test case syr_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1787,7 +1787,7 @@ TEST_F(TestCategory, syr_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); - test_syr, Kokkos::complex, TestExecSpace>( + test_syr, Kokkos::complex, TestDevice>( "test case syr_complex_float"); Kokkos::Profiling::popRegion(); } @@ -1798,7 +1798,7 @@ TEST_F(TestCategory, syr_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); - test_syr("test case syr_double"); + test_syr("test case syr_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1808,7 +1808,7 @@ TEST_F(TestCategory, syr_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); - test_syr, Kokkos::complex, TestExecSpace>( + test_syr, Kokkos::complex, TestDevice>( "test case syr_complex_double"); Kokkos::Profiling::popRegion(); } @@ -1819,7 +1819,7 @@ TEST_F(TestCategory, syr_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_int) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); - test_syr("test case syr_int"); + test_syr("test case syr_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1828,7 +1828,7 @@ TEST_F(TestCategory, syr_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int_float"); - test_syr("test case syr_int_float"); + test_syr("test case syr_int_float"); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index bc2d881600..cd91bc6d95 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -81,7 +81,7 @@ void build_matrices(const int M, const int N, const int K, const typename ViewTypeA::value_type alpha, ViewTypeA& A, ViewTypeB& B, const typename ViewTypeA::value_type beta, ViewTypeC& C, ViewTypeC& Cref) { - using execution_space = typename TestExecSpace::execution_space; + using execution_space = typename TestDevice::execution_space; using ScalarA = typename ViewTypeA::non_const_value_type; using ScalarB = typename ViewTypeB::non_const_value_type; using ScalarC = typename ViewTypeC::non_const_value_type; @@ -337,10 +337,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, template void test_gemm() { - typedef typename TestExecSpace::execution_space execution_space; - typedef Kokkos::View view_type_a; - typedef Kokkos::View view_type_b; - typedef Kokkos::View view_type_c; + typedef typename TestDevice::execution_space execution_space; + typedef Kokkos::View view_type_a; + typedef Kokkos::View view_type_b; + typedef Kokkos::View view_type_c; std::vector modes = {"N", "T"}; if (std::is_same>::value || std::is_same>::value) @@ -350,37 +350,32 @@ void test_gemm() { for (Scalar beta : betas) { for (auto amode : modes) { for (auto bmode : modes) { - Test::impl_test_gemm(amode, bmode, 0, 0, 0, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 0, 
0, 0, alpha, beta); // BMK: N = 1 exercises the special GEMV code path in GEMM (currently, // only for modes N/N) - Test::impl_test_gemm(amode, bmode, 50, 1, 40, alpha, - beta); + Test::impl_test_gemm( + amode, bmode, 50, 1, 40, alpha, beta); // LBV: K = 0 exercise the quick return code path in GEMM - Test::impl_test_gemm(amode, bmode, 20, 14, 0, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 13, 15, 17, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 179, 15, 211, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 12, 3071, 517, alpha, - beta); + Test::impl_test_gemm( + amode, bmode, 20, 14, 0, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 13, 15, 17, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 179, 15, 211, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 12, 3071, 517, alpha, beta); } } } auto pool_size = execution_space().concurrency(); if (pool_size >= 2) { - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 53, 42, 17, 4.5, 3.0); // General code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 7, 13, 17, 4.5, 3.0); // dot based gemm code path } @@ -402,8 +397,8 @@ void test_gemm_enabled_layouts() { template void test_gemm_mixed_scalars() { - using Matrix1 = Kokkos::View; - using Matrix2 = Kokkos::View; + using Matrix1 = Kokkos::View; + using Matrix2 = Kokkos::View; const int dim1 = 400, dim2 = 1000; @@ -416,8 +411,8 @@ void test_gemm_mixed_scalars() { Kokkos::deep_copy(B, Kokkos::ArithTraits::one()); Kokkos::deep_copy(C, Kokkos::ArithTraits::one()); - KokkosBlas::gemm(TestExecSpace(), "N", "N", 1.0, D, A, 0.0, C); - KokkosBlas::gemm(TestExecSpace(), "N", "T", 1.0, C, D, 0.0, B); + KokkosBlas::gemm(TestDevice(), "N", "N", 1.0, D, A, 0.0, C); + KokkosBlas::gemm(TestDevice(), "N", "T", 1.0, C, D, 0.0, B); } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ 
diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index 188999c5e0..a186835aaa 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -259,42 +259,42 @@ int test_trmm(const char* mode, ScalarA alpha) { TEST_F(TestCategory, trmm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_float"); float alpha = 1.0f; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); alpha = 4.5f; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + 
test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -305,42 +305,42 @@ TEST_F(TestCategory, trmm_float) { TEST_F(TestCategory, trmm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_double"); double alpha = 1.0; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); alpha = 4.5; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); 
+ test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -351,194 +351,194 @@ TEST_F(TestCategory, trmm_double) { ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", 1.0); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", 1.0); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, 
trmm_complex_double_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", Kokkos::complex(4.5, 
0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } @@ -550,194 +550,194 @@ TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCU", + 1.0f); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", Kokkos::complex(4.5f, 
0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, 
Kokkos::complex, TestDevice>( "RUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 5edd175652..9a00f22263 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -261,42 +261,42 @@ int test_trsm(const char* mode, ScalarA alpha) { TEST_F(TestCategory, trsm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_float"); float alpha = 1.0f; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + 
test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); alpha = 4.5f; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -307,42 +307,42 @@ TEST_F(TestCategory, trsm_float) { TEST_F(TestCategory, trsm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_double"); double alpha = 1.0; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", 
alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); alpha = 4.5; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -353,73 +353,73 @@ TEST_F(TestCategory, trsm_double) { TEST_F(TestCategory, trsm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_double"); Kokkos::complex alpha = 1.0; - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNN", alpha); 
- test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCU", alpha); alpha = Kokkos::complex(4.5, 0.0); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, 
Kokkos::complex, TestDevice>( "LUCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCU", alpha); Kokkos::Profiling::popRegion(); } @@ -431,74 +431,74 @@ TEST_F(TestCategory, trsm_complex_double) { TEST_F(TestCategory, trsm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_float"); Kokkos::complex alpha = 1.0f; - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNN", alpha); - test_trsm, Kokkos::complex, 
TestExecSpace>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", + alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", + alpha); alpha = Kokkos::complex(4.5f, 0.0f); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNN", alpha); - 
test_trsm, Kokkos::complex, TestExecSpace>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", + alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", + alpha); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas_Newton.hpp b/blas/unit_test/Test_Blas_Newton.hpp index 001a6b2395..5bb6946e99 100644 --- a/blas/unit_test/Test_Blas_Newton.hpp +++ b/blas/unit_test/Test_Blas_Newton.hpp @@ -191,8 +191,8 @@ int test_intersection() { template int test_newton() { - Test::test_logistic(); - Test::test_intersection(); + Test::test_logistic(); + Test::test_intersection(); return 1; } diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/blas/unit_test/Test_Blas_gesv.hpp index 710102137e..57ee6373bf 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/blas/unit_test/Test_Blas_gesv.hpp @@ -343,15 +343,15 @@ int test_gesv_mrhs(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_float"); 
- test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_float"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -361,15 +361,15 @@ TEST_F(TestCategory, gesv_mrhs_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_double"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_double"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -379,16 +379,15 @@ TEST_F(TestCategory, gesv_mrhs_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_double"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_double"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( - "Y"); // Partial pivoting + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting 
Kokkos::Profiling::popRegion(); } #endif @@ -398,16 +397,15 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_float"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_float"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( - "Y"); // Partial pivoting + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas_serial_axpy.hpp b/blas/unit_test/Test_Blas_serial_axpy.hpp index e6a571b7de..427925a3dc 100644 --- a/blas/unit_test/Test_Blas_serial_axpy.hpp +++ b/blas/unit_test/Test_Blas_serial_axpy.hpp @@ -156,35 +156,35 @@ int test_blas_serial_axpy() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, serial_axpy_float_float) { - test_blas_serial_axpy(); + test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, serial_axpy_double_double) { - test_blas_serial_axpy(); + test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { - test_blas_serial_axpy, + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, serial_axpy_dcomplex_double) { - test_blas_serial_axpy, double>(); + test_blas_serial_axpy, double>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { - test_blas_serial_axpy, + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, 
serial_axpy_fcomplex_float) { - test_blas_serial_axpy, float>(); + test_blas_serial_axpy, float>(); } #endif diff --git a/blas/unit_test/Test_Blas_serial_nrm2.hpp b/blas/unit_test/Test_Blas_serial_nrm2.hpp index 1e0a7a4ffa..147df52353 100644 --- a/blas/unit_test/Test_Blas_serial_nrm2.hpp +++ b/blas/unit_test/Test_Blas_serial_nrm2.hpp @@ -263,25 +263,25 @@ int test_blas_serial_nrm2() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, serial_nrm2_float_float) { - test_blas_serial_nrm2(); + test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, serial_nrm2_double_double) { - test_blas_serial_nrm2(); + test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_nrm2_fcomplex_float) { - test_blas_serial_nrm2 >(); + test_blas_serial_nrm2 >(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { - test_blas_serial_nrm2 >(); + test_blas_serial_nrm2 >(); } #endif diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/blas/unit_test/Test_Blas_trtri.hpp index 0bebb9edf0..aa12fa959b 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/blas/unit_test/Test_Blas_trtri.hpp @@ -363,10 +363,10 @@ int test_trtri(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_float"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -376,10 +376,10 @@ TEST_F(TestCategory, trtri_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_double"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); 
Kokkos::Profiling::popRegion(); } #endif @@ -389,10 +389,10 @@ TEST_F(TestCategory, trtri_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_double"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -402,10 +402,10 @@ TEST_F(TestCategory, trtri_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_float"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index b5bf350847..37bb8fce80 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -1703,6 +1703,6 @@ void test_ArithTraits() { success = runAllArithTraitsHostTests(out, 0); EXPECT_TRUE(success); } -TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } +TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } #endif // KOKKOS_ARITHTRAITSTEST_HPP diff --git a/common/unit_test/Test_Common_IOUtils.hpp b/common/unit_test/Test_Common_IOUtils.hpp index c4e031adf4..1219304421 100644 --- a/common/unit_test/Test_Common_IOUtils.hpp +++ b/common/unit_test/Test_Common_IOUtils.hpp @@ -70,6 +70,6 @@ void testPrintView() { "[2x2 multi-vector]\n"); } -TEST_F(TestCategory, common_print_view) { 
testPrintView(); } +TEST_F(TestCategory, common_print_view) { testPrintView(); } #endif // KOKKOSKERNELS_IOTEST_HPP diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 3ff27da23c..476a44abf4 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -228,31 +228,31 @@ void test_lower_bound() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int, TestExecSpace) +EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int64_t, TestExecSpace) +EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(size_t, TestExecSpace) +EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, TestExecSpace) +EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, TestExecSpace) +EXECUTE_TEST(double, TestDevice) #endif #undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index 07a55e152b..6638c6e398 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -57,7 +57,7 @@ void testPrintConfiguration() { } TEST_F(TestCategory, common_print_configuration) { - testPrintConfiguration(); + testPrintConfiguration(); } #endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP diff --git 
a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 6969453395..eea69437d0 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -508,8 +508,8 @@ TEST_F(TestCategory, common_serial_radix) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { - testSerialRadixSort(numArrays, arrayMax); - testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); } } @@ -518,10 +518,10 @@ TEST_F(TestCategory, common_serial_radix2) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { - testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2>(numArrays, - arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2>(numArrays, + arrayMax); } } @@ -530,8 +530,8 @@ TEST_F(TestCategory, common_team_bitonic) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort(numArrays, arrayMax); - testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); } } @@ -540,27 +540,27 @@ TEST_F(TestCategory, common_team_bitonic2) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, - arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2>(numArrays, + arrayMax); } } 
TEST_F(TestCategory, common_device_bitonic) { // Test device-level bitonic with some larger arrays - testBitonicSort(243743); - testBitonicSort(2157); - testBitonicSort(424); - testBitonicSort(5); - testBitonicSort(92314); - testBitonicSort(123); - testBitonicSort(60234); - testBitonicSort(53); + testBitonicSort(243743); + testBitonicSort(2157); + testBitonicSort(424); + testBitonicSort(5); + testBitonicSort(92314); + testBitonicSort(123); + testBitonicSort(60234); + testBitonicSort(53); // Test custom comparator: ">" instead of "<" to sort descending - testBitonicSortDescending(); + testBitonicSortDescending(); // Test custom comparator: lexicographic comparison of 3-element struct - testBitonicSortLexicographic(); + testBitonicSortLexicographic(); } #endif diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index a6d3b24d84..9e431285fd 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -219,31 +219,31 @@ void test_upper_bound() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int, TestExecSpace) +EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int64_t, TestExecSpace) +EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(size_t, TestExecSpace) +EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, TestExecSpace) +EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, TestExecSpace) +EXECUTE_TEST(double, TestDevice) #endif #undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_set_bit_count.hpp b/common/unit_test/Test_Common_set_bit_count.hpp index dd65ced821..6e2c6e80b6 100644 --- a/common/unit_test/Test_Common_set_bit_count.hpp +++ b/common/unit_test/Test_Common_set_bit_count.hpp @@ -218,35 +218,35 @@ void test_ffs() { } TEST_F(TestCategory, common_set_bit_count) { - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); } TEST_F(TestCategory, common_ffs) { - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); - - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); - - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); } diff --git a/graph/unit_test/Test_Graph_coarsen.hpp b/graph/unit_test/Test_Graph_coarsen.hpp index 6f0eda42f3..95f1533c88 100644 --- a/graph/unit_test/Test_Graph_coarsen.hpp +++ b/graph/unit_test/Test_Graph_coarsen.hpp @@ -422,7 +422,7 @@ void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #endif @@ 
-430,21 +430,21 @@ EXECUTE_TEST(double, int, int, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index ee917e6ef4..5d4eec03ca 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -194,28 +194,28 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, 
TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 3b3cadd71b..7bd3c4cd40 100644 --- a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -247,28 +247,28 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git 
a/graph/unit_test/Test_Graph_graph_color_distance2.hpp b/graph/unit_test/Test_Graph_graph_color_distance2.hpp index d022a0d2d6..44ddaed0bf 100644 --- a/graph/unit_test/Test_Graph_graph_color_distance2.hpp +++ b/graph/unit_test/Test_Graph_graph_color_distance2.hpp @@ -343,28 +343,28 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_mis2.hpp b/graph/unit_test/Test_Graph_mis2.hpp index faaea3b155..c6fb7562e7 100644 --- a/graph/unit_test/Test_Graph_mis2.hpp +++ b/graph/unit_test/Test_Graph_mis2.hpp @@ -274,7 +274,7 @@ void test_mis2_coarsening_zero_rows() { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #endif @@ -282,21 +282,21 @@ 
EXECUTE_TEST(double, int, int, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 861634071e..2e05554d2d 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -151,28 +151,28 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if 
(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index 7cc3fd2a6d..f6b63ee562 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -525,29 +525,29 @@ void test_newton_on_device() { // No ETI is performed for these device routines // Just pick scalar types at will... TEST_F(TestCategory, Newton_status_float) { - ::Test::test_newton_status(); + ::Test::test_newton_status(); } TEST_F(TestCategory, Newton_status_double) { - ::Test::test_newton_status(); + ::Test::test_newton_status(); } TEST_F(TestCategory, Newton_simple_float) { - ::Test::test_simple_problems(); + ::Test::test_simple_problems(); } TEST_F(TestCategory, Newton_simple_double) { - ::Test::test_simple_problems(); + ::Test::test_simple_problems(); } TEST_F(TestCategory, Newton_system_float) { - ::Test::test_simple_systems(); + ::Test::test_simple_systems(); } TEST_F(TestCategory, Newton_system_double) { - ::Test::test_simple_systems(); + ::Test::test_simple_systems(); } TEST_F(TestCategory, Newton_parallel_float) { - ::Test::test_newton_on_device(); + ::Test::test_newton_on_device(); } TEST_F(TestCategory, Newton_parallel_double) { - ::Test::test_newton_on_device(); + ::Test::test_newton_on_device(); } diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 039e0211eb..c7d1a84865 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -561,11 +561,11 @@ void test_adaptivity() { } // namespace Test -void test_RK() { Test::test_RK(); } +void test_RK() { Test::test_RK(); } -void test_RK_conv_rate() { Test::test_convergence_rate(); } +void test_RK_conv_rate() { 
Test::test_convergence_rate(); } -void test_RK_adaptivity() { Test::test_adaptivity(); } +void test_RK_adaptivity() { Test::test_adaptivity(); } #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, RKSolve_serial) { test_RK(); } diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 763394e1ec..763f38a013 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -189,7 +189,7 @@ void test_chem() { } // namespace Test int test_chem_models() { - Test::test_chem(); + Test::test_chem(); return 1; } diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp index 9fc533ed9c..b6301778a3 100644 --- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -564,35 +564,35 @@ void test_merge_matrix() { TEST_F(TestCategory, common_merge_matrix) { // clang-format off - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + 
Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); // test some select integer / float combos - Test_Sparse_MergeMatrix::test_merge_matrix(); - Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); // no generally safe way to compare all possible values of these types - // Test_Sparse_MergeMatrix::test_merge_matrix(); - // Test_Sparse_MergeMatrix::test_merge_matrix(); - // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); // clang-format on } diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index fe092461f3..c06509b3ec 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -506,14 +506,14 @@ TEST_F(TestCategory, common_sort_crsgraph) { // because the exec space type is determined from the graph. 
if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortCRS(10, 10, 20, false, doStructInterface, - howExecSpecified); - testSortCRS(100, 100, 2000, false, doStructInterface, - howExecSpecified); - testSortCRS(1000, 1000, 30000, false, doStructInterface, - howExecSpecified); + testSortCRS(10, 10, 20, false, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, false, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, false, doStructInterface, + howExecSpecified); } - testSortCRSUnmanaged(false, doStructInterface); + testSortCRSUnmanaged(false, doStructInterface); } } @@ -525,24 +525,24 @@ TEST_F(TestCategory, common_sort_crsmatrix) { // because the exec space type is determined from the matrix. if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortCRS(10, 10, 20, true, doStructInterface, - howExecSpecified); - testSortCRS(100, 100, 2000, true, doStructInterface, - howExecSpecified); - testSortCRS(1000, 1000, 30000, true, doStructInterface, - howExecSpecified); + testSortCRS(10, 10, 20, true, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, true, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, true, doStructInterface, + howExecSpecified); } - testSortCRSUnmanaged(true, doStructInterface); + testSortCRSUnmanaged(true, doStructInterface); } } TEST_F(TestCategory, common_sort_crs_longrows) { // Matrix/graph with one very long row // Just test this once with graph, and once with matrix - testSortCRS(1, 50000, 10000, false, false, - SortCrsTest::ImplicitType); - testSortCRS(1, 50000, 10000, true, false, - SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, false, false, + SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, true, false, + SortCrsTest::ImplicitType); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { @@ -555,8 +555,8 @@ TEST_F(TestCategory, common_sort_merge_crsmatrix) { 
howExecSpecified == SortCrsTest::ExplicitType) continue; if (doStructInterface && inPlace) continue; - testSortAndMerge(false, howExecSpecified, - doStructInterface, inPlace, testCase); + testSortAndMerge(false, howExecSpecified, + doStructInterface, inPlace, testCase); } } } @@ -573,8 +573,8 @@ TEST_F(TestCategory, common_sort_merge_crsgraph) { howExecSpecified == SortCrsTest::ExplicitType) continue; if (doStructInterface && inPlace) continue; - testSortAndMerge(true, howExecSpecified, - doStructInterface, inPlace, testCase); + testSortAndMerge(true, howExecSpecified, + doStructInterface, inPlace, testCase); } } } diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 279f4f89f9..029ddd14b0 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -75,12 +75,12 @@ void doAllCsMat(size_t m, size_t n) { // Test randomly generated Cs matrices TEST_F(TestCategory, sparse_randcsmat) { // Square cases - for (int dim = 1; dim < 1024; dim *= 4) doAllCsMat(dim, dim); + for (int dim = 1; dim < 1024; dim *= 4) doAllCsMat(dim, dim); // Non-square cases for (int dim = 1; dim < 1024; dim *= 4) { - doAllCsMat(dim * 3, dim); - doAllCsMat(dim, dim * 3); + doAllCsMat(dim * 3, dim); + doAllCsMat(dim, dim * 3); } } } // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_Transpose.hpp b/sparse/unit_test/Test_Sparse_Transpose.hpp index 35f7a0516c..05773b6b75 100644 --- a/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -294,32 +294,32 @@ void testTransposeBsr(int numRows, int numCols, int blockSize) { TEST_F(TestCategory, sparse_transpose_matrix) { // Test both matrix and graph transpose with various sizes - testTranspose(100, 100, true); - testTranspose(500, 50, true); - testTranspose(50, 500, true); - testTranspose(4000, 2000, true); - testTranspose(2000, 4000, 
true); - testTranspose(2000, 2000, true); + testTranspose(100, 100, true); + testTranspose(500, 50, true); + testTranspose(50, 500, true); + testTranspose(4000, 2000, true); + testTranspose(2000, 4000, true); + testTranspose(2000, 2000, true); } TEST_F(TestCategory, sparse_transpose_graph) { - testTranspose(100, 100, false); - testTranspose(500, 50, false); - testTranspose(50, 500, false); - testTranspose(4000, 2000, false); - testTranspose(2000, 4000, false); - testTranspose(2000, 2000, false); + testTranspose(100, 100, false); + testTranspose(500, 50, false); + testTranspose(50, 500, false); + testTranspose(4000, 2000, false); + testTranspose(2000, 4000, false); + testTranspose(2000, 2000, false); } TEST_F(TestCategory, sparse_transpose_bsr_matrix) { - testTransposeBsrRef(); + testTransposeBsrRef(); // Test bsrMatrix transpose with various sizes - testTransposeBsr(100, 100, 3); - testTransposeBsr(500, 50, 5); - testTransposeBsr(50, 500, 16); - testTransposeBsr(4000, 2000, 3); - testTransposeBsr(2000, 4000, 3); - testTransposeBsr(2000, 2000, 5); + testTransposeBsr(100, 100, 3); + testTransposeBsr(500, 50, 5); + testTransposeBsr(50, 500, 16); + testTransposeBsr(4000, 2000, 3); + testTransposeBsr(2000, 4000, 3); + testTransposeBsr(2000, 2000, 5); } #endif diff --git a/sparse/unit_test/Test_Sparse_ccs2crs.hpp b/sparse/unit_test/Test_Sparse_ccs2crs.hpp index 56972b8a07..f7e2797759 100644 --- a/sparse/unit_test/Test_Sparse_ccs2crs.hpp +++ b/sparse/unit_test/Test_Sparse_ccs2crs.hpp @@ -136,19 +136,19 @@ TEST_F(TestCategory, sparse_ccs2crs) { std::srand(ticks); // Empty cases - doCcs2Crs(1, 0, 1, 10); - doCcs2Crs(0, 1, 1, 10); + doCcs2Crs(1, 0, 1, 10); + doCcs2Crs(0, 1, 1, 10); - doCcs2Crs(1, 0, 1, 10); - doCcs2Crs(0, 1, 1, 10); + doCcs2Crs(1, 0, 1, 10); + doCcs2Crs(0, 1, 1, 10); - doCcs2Crs(0, 0, 1, 10); - doCcs2Crs(0, 0, 1, 10); + doCcs2Crs(0, 0, 1, 10); + doCcs2Crs(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 
1; - doAllCcs2crs(dim, dim); + doAllCcs2crs(dim, dim); } // Non-square cases @@ -156,16 +156,16 @@ TEST_F(TestCategory, sparse_ccs2crs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCcs2crs(m, n); + doAllCcs2crs(m, n); } // Fully sparse cases - doCcs2Crs(5, 5, 1, 10, true); - doCcs2Crs(50, 10, 10, 100, true); + doCcs2Crs(5, 5, 1, 10, true); + doCcs2Crs(50, 10, 10, 100, true); // Test the convenience wrapper that accepts a ccs matrix - RandCsMatrix csMat(2, 2, 10, 10, - false); + RandCsMatrix csMat(2, 2, 10, 10, + false); auto ccsMatrix = crs2ccs(csMat.get_dim1(), csMat.get_dim2(), csMat.get_nnz(), csMat.get_vals(), csMat.get_map(), csMat.get_ids()); auto crsMatrix = ccs2crs(ccsMatrix); diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp index a2ccd6bc62..3427ec44cd 100644 --- a/sparse/unit_test/Test_Sparse_coo2crs.hpp +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -242,12 +242,12 @@ TEST_F(TestCategory, sparse_coo2crs) { UINT32_MAX; std::srand(ticks); - doAllCoo2Crs(0, 0); + doAllCoo2Crs(0, 0); // Square cases for (size_t i = 1; i < 256; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCoo2Crs(dim, dim); + doAllCoo2Crs(dim, dim); } // Non-square cases @@ -255,11 +255,11 @@ TEST_F(TestCategory, sparse_coo2crs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCoo2Crs(m, n); + doAllCoo2Crs(m, n); } - RandCooMat cooMat(2, 2, 2 * 2, 10, - 10); + RandCooMat cooMat(2, 2, 2 * 2, 10, + 10); auto crsMatrix = KokkosSparse::coo2crs(2, 2, cooMat.get_row(), cooMat.get_col(), cooMat.get_data()); auto cooMatrix = KokkosSparse::crs2coo(crsMatrix); @@ -276,15 +276,15 @@ TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { float staticData[16]{7.28411, 8.17991, 8.84304, 5.01788, 9.85646, 5.79404, 8.42014, 1.90238, 8.24195, 4.39955, 3.2637, 5.4546, 6.51895, 8.09302, 
9.36294, 3.44206}; - Kokkos::View row("coo row", 16); - Kokkos::View col("coo col", 16); - Kokkos::View data("coo data", 16); + Kokkos::View row("coo row", 16); + Kokkos::View col("coo col", 16); + Kokkos::View data("coo data", 16); - typename Kokkos::View::HostMirror row_h = + typename Kokkos::View::HostMirror row_h = Kokkos::create_mirror_view(row); - typename Kokkos::View::HostMirror col_h = + typename Kokkos::View::HostMirror col_h = Kokkos::create_mirror_view(col); - typename Kokkos::View::HostMirror data_h = + typename Kokkos::View::HostMirror data_h = Kokkos::create_mirror_view(data); for (int i = 0; i < 16; i++) { row_h(i) = staticRow[i]; diff --git a/sparse/unit_test/Test_Sparse_crs2ccs.hpp b/sparse/unit_test/Test_Sparse_crs2ccs.hpp index 720c6cd05e..46cc2fb361 100644 --- a/sparse/unit_test/Test_Sparse_crs2ccs.hpp +++ b/sparse/unit_test/Test_Sparse_crs2ccs.hpp @@ -134,19 +134,19 @@ TEST_F(TestCategory, sparse_crs2ccs) { std::srand(ticks); // Empty cases - doCrs2Ccs(1, 0, 1, 10); - doCrs2Ccs(0, 1, 1, 10); + doCrs2Ccs(1, 0, 1, 10); + doCrs2Ccs(0, 1, 1, 10); - doCrs2Ccs(1, 0, 1, 10); - doCrs2Ccs(0, 1, 1, 10); + doCrs2Ccs(1, 0, 1, 10); + doCrs2Ccs(0, 1, 1, 10); - doCrs2Ccs(0, 0, 1, 10); - doCrs2Ccs(0, 0, 1, 10); + doCrs2Ccs(0, 0, 1, 10); + doCrs2Ccs(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCrs2Ccs(dim, dim); + doAllCrs2Ccs(dim, dim); } // Non-square cases @@ -154,16 +154,16 @@ TEST_F(TestCategory, sparse_crs2ccs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCrs2Ccs(m, n); + doAllCrs2Ccs(m, n); } // Fully sparse cases - doCrs2Ccs(5, 5, 1, 10, true); - doCrs2Ccs(50, 10, 10, 100, true); + doCrs2Ccs(5, 5, 1, 10, true); + doCrs2Ccs(50, 10, 10, 100, true); // Test the convenience wrapper that accepts a crs matrix - RandCsMatrix csMat(2, 2, 10, 10, - false); + RandCsMatrix csMat(2, 2, 10, 10, + false); auto crsMatrix 
= ccs2crs(csMat.get_dim2(), csMat.get_dim1(), csMat.get_nnz(), csMat.get_vals(), csMat.get_map(), csMat.get_ids()); auto ccsMatrix = crs2ccs(crsMatrix); diff --git a/sparse/unit_test/Test_Sparse_crs2coo.hpp b/sparse/unit_test/Test_Sparse_crs2coo.hpp index 13ff60b0c8..9f81e20f90 100644 --- a/sparse/unit_test/Test_Sparse_crs2coo.hpp +++ b/sparse/unit_test/Test_Sparse_crs2coo.hpp @@ -128,7 +128,7 @@ TEST_F(TestCategory, sparse_crs2coo) { // Square cases for (size_t i = 1; i < 256; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCrs2Coo(dim, dim); + doAllCrs2Coo(dim, dim); } // Non-square cases @@ -136,7 +136,7 @@ TEST_F(TestCategory, sparse_crs2coo) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCrs2Coo(m, n); + doAllCrs2Coo(m, n); } } } // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_csc2csr.hpp b/sparse/unit_test/Test_Sparse_csc2csr.hpp index 61857a3e4f..aa838a4428 100644 --- a/sparse/unit_test/Test_Sparse_csc2csr.hpp +++ b/sparse/unit_test/Test_Sparse_csc2csr.hpp @@ -124,19 +124,19 @@ TEST_F(TestCategory, sparse_csc2csr) { std::srand(ticks); // Empty cases - doCsc2Csr(1, 0, 1, 10); - doCsc2Csr(0, 1, 1, 10); + doCsc2Csr(1, 0, 1, 10); + doCsc2Csr(0, 1, 1, 10); - doCsc2Csr(1, 0, 1, 10); - doCsc2Csr(0, 1, 1, 10); + doCsc2Csr(1, 0, 1, 10); + doCsc2Csr(0, 1, 1, 10); - doCsc2Csr(0, 0, 1, 10); - doCsc2Csr(0, 0, 1, 10); + doCsc2Csr(0, 0, 1, 10); + doCsc2Csr(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCsc2csr(dim, dim); + doAllCsc2csr(dim, dim); } // Non-square cases @@ -144,11 +144,11 @@ TEST_F(TestCategory, sparse_csc2csr) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCsc2csr(m, n); + doAllCsc2csr(m, n); } // Fully sparse cases - doCsc2Csr(5, 5, 1, 10, true); - doCsc2Csr(50, 10, 10, 100, true); + 
doCsc2Csr(5, 5, 1, 10, true); + doCsc2Csr(50, 10, 10, 100, true); } } // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_findRelOffset.hpp b/sparse/unit_test/Test_Sparse_findRelOffset.hpp index 9c7224b756..642f1666e7 100644 --- a/sparse/unit_test/Test_Sparse_findRelOffset.hpp +++ b/sparse/unit_test/Test_Sparse_findRelOffset.hpp @@ -430,13 +430,13 @@ void test_findRelOffset() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #undef EXECUTE_TEST diff --git a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp index bdd175558f..52a9a1874b 100644 --- a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp +++ b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -224,8 +224,8 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { if (haveHardcodedReference) { Matrix Afiltered_refimpl = removeMatrixZerosReference(A); bool referenceImplMatchesHardcoded = - Test::is_same_matrix(Afiltered_ref, - Afiltered_refimpl); + Test::is_same_matrix(Afiltered_ref, + Afiltered_refimpl); ASSERT_TRUE(referenceImplMatchesHardcoded) << "Test case " << test << ": reference impl gave wrong answer!"; } @@ -235,14 +235,13 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { void testRemoveCrsMatrixZeros(int testCase) { using namespace TestRemoveCrsMatrixZeros; - using Matrix = - KokkosSparse::CrsMatrix; + using Matrix = KokkosSparse::CrsMatrix; Matrix A, Afiltered_ref; getTestInput(testCase, A, Afiltered_ref); Matrix Afiltered_actual = 
KokkosSparse::removeCrsMatrixZeros(A); - bool matches = Test::is_same_matrix(Afiltered_actual, - Afiltered_ref); + bool matches = + Test::is_same_matrix(Afiltered_actual, Afiltered_ref); EXPECT_TRUE(matches) << "Test case " << testCase << ": matrix with zeros filtered out does not match reference."; diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index ca5c5a22f4..8c9c72017a 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -1321,12 +1321,12 @@ void test_spmv_all_interfaces_light() { #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_ISSUE_101(TestExecSpace) +EXECUTE_TEST_ISSUE_101(TestDevice) #endif #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ - EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestDevice) \ + EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestDevice) #include @@ -1336,10 +1336,10 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ - EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ - EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) \ + EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -1351,9 +1351,9 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define 
KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) \ - EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 839421d916..164c87b8df 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -744,8 +744,7 @@ void test_spm_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ - TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -756,8 +755,7 @@ void test_spm_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ - TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/sparse/unit_test/Test_Sparse_trsv.hpp b/sparse/unit_test/Test_Sparse_trsv.hpp index e6bc13d6a0..d580cc472d 100644 --- a/sparse/unit_test/Test_Sparse_trsv.hpp +++ b/sparse/unit_test/Test_Sparse_trsv.hpp @@ -139,7 +139,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -152,7 +152,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/test_common/Test_Common_Test_All_Type_Combos.hpp b/test_common/Test_Common_Test_All_Type_Combos.hpp index c51601fdf4..a51d796632 100644 --- a/test_common/Test_Common_Test_All_Type_Combos.hpp +++ b/test_common/Test_Common_Test_All_Type_Combos.hpp @@ -31,26 +31,25 @@ // ETI is off, test all possible type combos -KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestDevice) #if !defined(NO_TEST_COMPLEX) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, 
TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestDevice) #endif @@ -61,49 +60,49 @@ KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, 
int64_t, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestDevice) #endif #if !defined(NO_TEST_COMPLEX) @@ -111,50 +110,49 @@ KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) 
-KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestDevice) #endif #endif // !NO_TEST_COMPLEX diff --git a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 9230d7f935..01eb78dfef 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -39,9 +39,9 @@ using CudaUVMSpaceDevice = Kokkos::Device; // Prefer for any 
testing where only one exec space is used #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) && \ !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE) -#define TestExecSpace CudaUVMSpaceDevice +#define TestDevice CudaUVMSpaceDevice #else -#define TestExecSpace CudaSpaceDevice +#define TestDevice CudaSpaceDevice #endif #endif // TEST_CUDA_HPP diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 7e61bfc9c3..6d619d1378 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -32,6 +32,6 @@ class hip : public ::testing::Test { }; #define TestCategory hip -#define TestExecSpace Kokkos::Experimental::HIP +#define TestDevice Kokkos::Experimental::HIP #endif // TEST_HIP_HPP diff --git a/test_common/Test_OpenMP.hpp b/test_common/Test_OpenMP.hpp index 8b4f90730e..3d110e4479 100644 --- a/test_common/Test_OpenMP.hpp +++ b/test_common/Test_OpenMP.hpp @@ -32,6 +32,6 @@ class openmp : public ::testing::Test { }; #define TestCategory openmp -#define TestExecSpace Kokkos::OpenMP +#define TestDevice Kokkos::OpenMP #endif // TEST_OPENMP_HPP diff --git a/test_common/Test_OpenMPTarget.hpp b/test_common/Test_OpenMPTarget.hpp index 2056d8be01..d41f95dad1 100644 --- a/test_common/Test_OpenMPTarget.hpp +++ b/test_common/Test_OpenMPTarget.hpp @@ -32,6 +32,6 @@ class openmptarget : public ::testing::Test { }; #define TestCategory openmptarget -#define TestExecSpace Kokkos::Experimental::OpenMPTarget +#define TestDevice Kokkos::Experimental::OpenMPTarget #endif // TEST_OPENMPTARGET_HPP diff --git a/test_common/Test_SYCL.hpp b/test_common/Test_SYCL.hpp index c7022f35d1..493b8082a4 100644 --- a/test_common/Test_SYCL.hpp +++ b/test_common/Test_SYCL.hpp @@ -29,4 +29,4 @@ class sycl_test : public ::testing::Test { }; #define TestCategory sycl_test -#define TestExecSpace Kokkos::Experimental::SYCL +#define TestDevice Kokkos::Experimental::SYCL diff --git a/test_common/Test_Serial.hpp b/test_common/Test_Serial.hpp index fe2917937b..aca218cade 100644 --- 
a/test_common/Test_Serial.hpp +++ b/test_common/Test_Serial.hpp @@ -32,6 +32,6 @@ class serial : public ::testing::Test { }; #define TestCategory serial -#define TestExecSpace Kokkos::Serial +#define TestDevice Kokkos::Serial #endif // TEST_SERIAL_HPP diff --git a/test_common/Test_Threads.hpp b/test_common/Test_Threads.hpp index 1e2919b68f..d527023c8f 100644 --- a/test_common/Test_Threads.hpp +++ b/test_common/Test_Threads.hpp @@ -32,6 +32,6 @@ class threads : public ::testing::Test { }; #define TestCategory threads -#define TestExecSpace Kokkos::Threads +#define TestDevice Kokkos::Threads #endif // TEST_THREADS_HPP From e187fdf53af5f936363de1ee8bc2412fc15754e2 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 19 Sep 2023 15:28:25 -0600 Subject: [PATCH 178/231] Fix #1972 Fix some uses of execution_space in ODE test that I missed in #1969. Also, for as long as the Kokkos_ENABLE_CUDA_UVM option exists, check that KokkosKernels is actually instantiating for CudaUVMSpace (without this, tests will hit linker errors) --- ode/unit_test/Test_ODE_Newton.hpp | 23 ++++++++++------------- test_common/Test_Cuda.hpp | 13 ++++++++++++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index f6b63ee562..d235df1e56 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -158,10 +158,10 @@ struct QuadraticEquation { // f(x) = cos(x) - x = 0 // Solution: 0.739085 // f'(x) = -sin(x) - 1 -template +template struct TrigonometricEquation { - using vec_type = Kokkos::View; - using mat_type = Kokkos::View; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; static constexpr int neqs = 1; @@ -180,10 +180,10 @@ struct TrigonometricEquation { // f(x) = 7x - log(7x) - 1 = 0 // Solution: 1/7 = 0.14285714285 // f'(x) = 7 - (1 / x) -template +template struct LogarithmicEquation { - using vec_type = Kokkos::View; - using mat_type = Kokkos::View; + using vec_type 
= Kokkos::View; + using mat_type = Kokkos::View; static constexpr int neqs = 1; @@ -216,8 +216,7 @@ void test_newton_status() { throw std::runtime_error("scalar_type is neither float, nor double!"); } KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); - Kokkos::View status( - "newton solver status", 1); + Kokkos::View status("newton solver status", 1); auto status_h = Kokkos::create_mirror_view(status); // Create the non-linear system and initialize data @@ -262,7 +261,6 @@ void test_newton_status() { template void test_simple_problems() { - using execution_space = typename Device::execution_space; double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -299,7 +297,7 @@ void test_simple_problems() { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "\nStarting Trigonometric Equation problem" << std::endl; #endif - using system_type = TrigonometricEquation; + using system_type = TrigonometricEquation; system_type mySys{}; scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; run_newton_test(mySys, params, @@ -314,7 +312,7 @@ void test_simple_problems() { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "\nStarting Logarithmic Equation problem" << std::endl; #endif - using system_type = LogarithmicEquation; + using system_type = LogarithmicEquation; system_type mySys{}; scalar_type initial_value[1] = {static_cast(0.5)}, solution[1] = {static_cast(1.0) / @@ -400,7 +398,6 @@ struct CircleHyperbolaIntersection { template void test_simple_systems() { - using execution_space = typename Device::execution_space; double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -418,7 +415,7 @@ void test_simple_systems() { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "\nStarting Circles Intersetcion problem" << std::endl; #endif - using system_type = CirclesIntersections; + using system_type = CirclesIntersections; system_type mySys{}; scalar_type initial_values[2] = {1.5, 1.5}; scalar_type solution[2] = {10.75 / 6, 0.8887803753}; diff --git 
a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 01eb78dfef..cf1042a2c4 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -36,8 +36,19 @@ class Cuda : public ::testing::Test { using CudaSpaceDevice = Kokkos::Device; using CudaUVMSpaceDevice = Kokkos::Device; +#ifdef KOKKOS_ENABLE_CUDA_UVM +// KOKKOS_ENABLE_CUDA_UVM macro and cmake option is deprecated +// But if it is defined, test with CudaUVMSpace. +// Make sure it's instantiated first: +#if defined(KOKKOSKERNELS_TEST_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +#error \ + "Deprecated option KOKKOS_ENABLE_CUDA_UVM is defined, so KokkosKernels will test with CudaUVMSpace. " \ + "KokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=ON must be set in configuration." +#endif +#define TestDevice CudaUVMSpaceDevice // Prefer for any testing where only one exec space is used -#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) && \ +#elif defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) && \ !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE) #define TestDevice CudaUVMSpaceDevice #else From c5cb0190352e95d839aea605ab79611882c04423 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jul 2023 11:54:53 -0600 Subject: [PATCH 179/231] docs: set GENERATE_HTML = YES in Doxyfile --- docs/Doxyfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 954b6b669b..43e98e5d21 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -1186,7 +1186,7 @@ IGNORE_PREFIX = # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. -GENERATE_HTML = NO +GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of From 09acb51ae7722aaa36d4cd084fd0a7099ce00e94 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 19 Sep 2023 16:01:27 -0600 Subject: [PATCH 180/231] docs: KokkosBlase1_mult.hpp: remove incorrect return doc --- blas/src/KokkosBlas1_mult.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index f390b3556a..32ede3090c 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -137,8 +137,6 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, /// \param alpha [in] The scalar to apply to A. /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. -/// -/// \return Y = gamma * Y + alpha * A * X. template void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { From 62a2d729b63835bb6019c4786eb99135eb1e9287 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 19 Sep 2023 16:28:38 -0600 Subject: [PATCH 181/231] docs: don't require latex for formulas --- docs/Doxyfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 43e98e5d21..2b23557a81 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -1643,7 +1643,7 @@ FORMULA_MACROFILE = # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -USE_MATHJAX = NO +USE_MATHJAX = YES # With MATHJAX_VERSION it is possible to specify the MathJax version to be used. # Note that the different versions of MathJax have different requirements with From 70e391faafa1788324101657ca59683d31d8bd1e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 20 Sep 2023 14:27:21 -0600 Subject: [PATCH 182/231] Parallel prefix sum can infer view type Switch the template parameter order so that the View type can be inferred. 
--- common/src/KokkosKernels_SimpleUtils.hpp | 12 ++++----- common/src/KokkosKernels_Utils.hpp | 6 ++--- common/unit_test/Test_Common_Sorting.hpp | 2 +- .../KokkosSparse_par_ilut_numeric_impl.hpp | 4 +-- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 9 +++---- .../KokkosSparse_spgemm_impl_symbolic.hpp | 6 ++--- .../KokkosSparse_spgemm_impl_triangle.hpp | 3 +-- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 12 +++------ sparse/src/KokkosSparse_SortCrs.hpp | 6 ++--- sparse/src/KokkosSparse_Utils.hpp | 27 +++++++------------ 10 files changed, 32 insertions(+), 55 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index e25ec54eb0..3d50630336 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -84,7 +84,7 @@ struct InclusiveParallelPrefixSum { * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ -template +template inline void kk_exclusive_parallel_prefix_sum( const MyExecSpace &exec, typename view_t::value_type num_elements, view_t arr) { @@ -100,7 +100,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ -template +template inline void kk_exclusive_parallel_prefix_sum( typename view_t::value_type num_elements, view_t arr) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); @@ -116,7 +116,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param finalSum: will be set to arr[num_elements - 1] after computing the * prefix sum. 
*/ -template +template inline void kk_exclusive_parallel_prefix_sum( const MyExecSpace &exec, typename view_t::value_type num_elements, view_t arr, typename view_t::non_const_value_type &finalSum) { @@ -135,7 +135,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param finalSum: will be set to arr[num_elements - 1] after computing the * prefix sum. */ -template +template inline void kk_exclusive_parallel_prefix_sum( typename view_t::value_type num_elements, view_t arr, typename view_t::non_const_value_type &finalSum) { @@ -149,7 +149,7 @@ inline void kk_exclusive_parallel_prefix_sum( /// \param num_elements: size of the array /// \param arr: the array for which the prefix sum will be performed. /// -template +template void kk_inclusive_parallel_prefix_sum( MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, @@ -166,7 +166,7 @@ void kk_inclusive_parallel_prefix_sum( /// \param num_elements: size of the array /// \param arr: the array for which the prefix sum will be performed. 
/// -template +template void kk_inclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, forward_array_type arr) { diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index c6780185a4..e1c15505ff 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -459,8 +459,7 @@ void inclusive_parallel_prefix_sum( MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, forward_array_type arr) { - return kk_inclusive_parallel_prefix_sum( - my_exec_space, num_elements, arr); + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template @@ -475,8 +474,7 @@ template void exclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, forward_array_type arr) { - kk_exclusive_parallel_prefix_sum( - num_elements, arr); + kk_exclusive_parallel_prefix_sum(num_elements, arr); } template diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 51ecf228a8..45b5f9c4d2 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -47,7 +47,7 @@ size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, } Kokkos::deep_copy(randomCounts, countsHost); Kokkos::deep_copy(randomOffsets, randomCounts); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( n, randomOffsets); return total; } diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index c482aff429..0ac9c26166 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -60,8 +60,8 @@ struct IlutWrap { static size_type prefix_sum(RowMapType& row_map) { size_type result = 0; KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum< - RowMapType, typename 
IlutHandle::HandleExecSpace>(row_map.extent(0), - row_map, result); + typename IlutHandle::HandleExecSpace>(row_map.extent(0), row_map, + result); return result; } diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 62e074ff07..15132f9da3 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -523,8 +523,7 @@ void spadd_symbolic_impl( runSortedCountEntries( a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, @@ -542,8 +541,7 @@ void spadd_symbolic_impl( Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", range_type(0, nrows), countEntries); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap_upperbound); Kokkos::deep_copy(c_nnz_upperbound, Kokkos::subview(c_rowmap_upperbound, nrows)); @@ -585,8 +583,7 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", range_type(0, nrows), mergeEntries); // compute actual c_rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap); } addHandle->set_a_b_pos(a_pos, b_pos); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 225525002c..778f33c378 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1672,8 +1672,7 @@ void KokkosSPGEMM( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( m + 1, rowmapC, c_nnz_size); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); nnz_lno_t c_max_nnz = @@ -2188,8 +2187,7 @@ 
void KokkosSPGEMM< } #endif typename c_row_view_t::non_const_value_type c_nnz_size = 0; - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( m + 1, rowmapC, c_nnz_size); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); nnz_lno_t c_max_nnz = diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index dd1a7cd9b5..80d2fc1c04 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1818,8 +1818,7 @@ void KokkosSPGEMM( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( this->a_row_cnt + 1, rowmapC_); MyExecSpace().fence(); diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 00fdcd2442..f1f7a0e6cd 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -633,22 +633,18 @@ class TwostageGaussSeidel { // shift ptr so that it now contains offsets (combine it with the previous // functor calls?) 
if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewL); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewLa); } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewU); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewUa); } } diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 107923797a..c26ace9c69 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -634,8 +634,7 @@ void sort_and_merge_matrix(const exec_space& exec, auto entries_orig = entries_in; auto values_orig = values_in; // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( exec, numRows + 1, nc_rowmap_out); rowmap_out = nc_rowmap_out; entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -761,8 +760,7 @@ void sort_and_merge_graph(const exec_space& exec, // In the case where the output rowmap is the same as the input, we could just // assign "rowmap_out = rowmap_in" except that would break const-correctness. // Can skip filling the entries, however. 
- KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( exec, numRows + 1, nc_rowmap_out); rowmap_out = nc_rowmap_out; entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 42d7df244f..f3fbec1836 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -412,8 +412,7 @@ void transpose_matrix( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -497,8 +496,7 @@ void transpose_graph( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -802,8 +800,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -838,8 +835,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_reverse_elements + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -1500,8 +1496,7 @@ crstmat_t kk_get_lower_triangle( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1558,8 +1553,7 @@ crstmat_t kk_get_lower_crs_matrix( nr, ne, rowmap, entries, new_row_map.data(), new_indices, 
use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1612,8 +1606,7 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1666,8 +1659,7 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), use_dynamic_scheduling, chunksize, is_lower); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); exec_space().fence(); @@ -1775,8 +1767,7 @@ void kk_create_incidence_matrix_from_original_matrix( permutation.data(), use_dynamic_scheduling, chunksize, sort_decreasing_order); exec_space().fence(); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); // kk_print_1Dview(out_rowmap, false, 20); From a61ec3d873a0d592453e0b65e02faaf3be7fcb66 Mon Sep 17 00:00:00 2001 From: dekken Date: Thu, 21 Sep 2023 18:56:46 +0200 Subject: [PATCH 183/231] small typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 58127b912e..0da1057870 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ For a complete list of tunable Kokkos options, run spack info kokkos ```` -#### Settuping a development environment with Spack +#### Setting up a development environment with Spack Spack is generally most useful for installng packages to use. If you want to install all *dependencies* of Kokkos Kernels first so that you can actively develop a given Kokkos Kernels source this can still be done. 
Go to the Kokkos Kernels source code folder and run: ```` From 685c1921a6de0b0329b1c64691443b0d284eae62 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Wed, 23 Aug 2023 14:50:29 -0500 Subject: [PATCH 184/231] TPL: revise BLAS1 nrm2 implementation --- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 45 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 397 +++++++++--------- 2 files changed, 232 insertions(+), 210 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index a58c90d8e9..7bc55becc0 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -57,33 +57,40 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct nrm2_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + 
KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +#endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 5e017cb7e1..62139d2b12 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -214,210 +214,225 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, #endif -// cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS #include namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ + 1, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, 
ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include + +namespace KokkosBlas { 
+namespace Impl { -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDznrm2( \ - s.handle, N, reinterpret_cast(X.data()), \ - int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ + 1, &R())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScnrm2( \ - s.handle, N, reinterpret_cast(X.data()), \ - int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); 
\ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_snrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_dnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + 
nrm2_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_NRM2(space.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, &R()); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, 
Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas From 1fc3352b90753bdd71894544fd55516998f599e3 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 21 Sep 2023 13:29:13 -0600 Subject: [PATCH 185/231] Sparse: marking ordinal traits function constexpr --- sparse/src/KokkosSparse_OrdinalTraits.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index 6d76460939..b48fa6f6b3 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -55,44 +55,44 @@ struct OrdinalTraits { template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION short int invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION short int invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned short int invalid() { + static constexpr KOKKOS_INLINE_FUNCTION unsigned short int invalid() { return USHRT_MAX; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION int invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION int invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned int invalid() { return UINT_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned int invalid() { return UINT_MAX; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION long invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION long invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned long invalid() { return ULONG_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned long invalid() { return ULONG_MAX; } }; template <> struct OrdinalTraits 
{ - static KOKKOS_INLINE_FUNCTION long long invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION long long invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned long long invalid() { + static constexpr KOKKOS_INLINE_FUNCTION unsigned long long invalid() { return ULLONG_MAX; } }; From 64fe2f34ebaadc0d2709327ae4abf1330ac9130f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 21 Sep 2023 13:31:34 -0600 Subject: [PATCH 186/231] Sparse: Ordinal Traits, applying clang-format. --- sparse/src/KokkosSparse_OrdinalTraits.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index b48fa6f6b3..8a487de030 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -72,7 +72,9 @@ struct OrdinalTraits { template <> struct OrdinalTraits { - static constexpr KOKKOS_INLINE_FUNCTION unsigned int invalid() { return UINT_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned int invalid() { + return UINT_MAX; + } }; template <> @@ -82,7 +84,9 @@ struct OrdinalTraits { template <> struct OrdinalTraits { - static constexpr KOKKOS_INLINE_FUNCTION unsigned long invalid() { return ULONG_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned long invalid() { + return ULONG_MAX; + } }; template <> From dc901212af3263aa8c13312995a20ab4c3d9a447 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 22 Sep 2023 14:24:35 -0600 Subject: [PATCH 187/231] KokkosKernels: switching from printf macro to function Kokkos Core is about to remove the KOKKOS_PRINTF macro in favor of Kokkos::printf function. To prepare for this we are guarding the old macro to no longer use it when compiling against newer versions of Kokkos Core. 
--- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 42 +++ .../dense/impl/KokkosBatched_Copy_Impl.hpp | 50 +++ .../dense/impl/KokkosBatched_Dot_Internal.hpp | 93 ++++++ .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 75 +++++ .../KokkosBatched_HadamardProduct_Impl.hpp | 48 +++ .../dense/impl/KokkosBatched_Xpay_Impl.hpp | 42 +++ .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 78 +++++ .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 78 +++++ .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 78 +++++ .../sparse/src/KokkosBatched_JacobiPrec.hpp | 16 +- blas/src/KokkosBlas1_nrm2.hpp | 7 + blas/unit_test/Test_Blas2_ger.hpp | 122 ++++++++ blas/unit_test/Test_Blas2_syr.hpp | 98 ++++++ common/src/KokkosKernels_Error.hpp | 11 + common/src/KokkosKernels_SimpleUtils.hpp | 8 + common/unit_test/Test_Common_ArithTraits.hpp | 290 ++++++++++++++++++ common/unit_test/Test_Common_LowerBound.hpp | 12 + common/unit_test/Test_Common_UpperBound.hpp | 12 + ode/impl/KokkosODE_Newton_impl.hpp | 5 + sparse/src/KokkosSparse_spmv_team.hpp | 28 ++ sparse/unit_test/Test_Sparse_spmv.hpp | 13 + 21 files changed, 1205 insertions(+), 1 deletion(-) diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index beaef112f3..400c46544d 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -199,17 +199,31 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -249,17 +263,31 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -304,17 +332,31 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 2f0be4b661..5b693bb87a 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -59,10 +59,17 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#endif return 1; } #endif @@ -87,10 +94,17 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#endif return 1; } #endif @@ -143,12 +157,21 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -181,12 +204,21 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -245,12 +277,21 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -283,12 +324,21 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index a6a7673e7b..854069289e 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -186,19 +186,35 @@ struct SerialDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -232,18 +248,33 @@ struct SerialDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -282,19 +313,35 @@ struct TeamDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -337,18 +384,33 @@ struct TeamDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -395,19 +457,35 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -450,18 +528,33 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 0ef43ee4f8..e4e0d5b8b7 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -384,22 +384,39 @@ struct SerialGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " "%d x %d, tmp (note: its second dimension should be the second " "dimension of A + 4): %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), (int)tmp.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " + "%d x %d, tmp (note: its second dimension should be the second " + "dimension of A + 4): %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), + (int)tmp.extent(1)); +#endif return 1; } if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -414,9 +431,15 @@ struct SerialGesv { if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } @@ -458,11 +481,19 @@ struct SerialGesv { if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + 
"KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -509,11 +540,19 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -532,9 +571,15 @@ struct TeamGesv { if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } member.team_barrier(); @@ -587,11 +632,19 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -645,11 +698,19 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -668,9 +729,15 @@ struct TeamVectorGesv { if (TeamVectorStaticPivoting::invoke( member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } @@ -724,11 +791,19 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index ebd789c2e8..0570bc4ccc 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -110,19 +110,35 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), 
(int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif @@ -161,19 +177,35 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif @@ -214,19 +246,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 4f90c0be38..5e5b7e13cc 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -204,17 +204,31 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -247,17 +261,31 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -291,17 +319,31 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index b7527d923c..b96dc79a80 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -153,49 +153,95 @@ struct SerialSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if 
(colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -243,35 +289,67 @@ struct SerialSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 2b62be1e5a..d7379777be 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -341,49 +341,95 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), 
(int)beta.extent(0)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -438,35 +484,67 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index c46ef7edc7..beb53521f0 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -192,49 +192,95 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#endif return 1; } 
if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -289,35 +335,67 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 728bb2d921..82a1291268 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -109,10 +109,17 @@ class JacobiPrec { } if (tooSmall > 0) +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); +#else + Kokkos::printf( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); +#endif computed_inverse = true; } @@ -131,10 +138,17 @@ class JacobiPrec { } if (tooSmall > 0) +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); +#else + Kokkos::printf( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); +#endif computed_inverse = true; } @@ -168,4 +182,4 @@ class JacobiPrec { } // namespace KokkosBatched -#endif \ No newline at end of file +#endif diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 67cdde17fa..64643367a0 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -241,10 +241,17 @@ KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { " Kokkos::ArithTraits::mag_type"); if (R.extent(0) != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," " R: %d and X: %d x %d.\n", R.extent_int(0), 
X.extent_int(0), X.extent_int(1)); +#else + Kokkos::printf( + "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," + " R: %d and X: %d x %d.\n", + R.extent_int(0), X.extent_int(0), X.extent_int(1)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index abafd79ac9..8cbe903d97 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -292,9 +292,15 @@ void GerTester h_vanilla( "vanilla = A + alpha * x * y^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); +#else + Kokkos::printf( + "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); +#endif #endif this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); @@ -1357,10 +1363,17 @@ void GerTester #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); +#else + Kokkos::printf( + "+=======================================================================" + "===\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); +#else + Kokkos::printf("Starting %s, device = %s ...\n", + caseName.c_str(), typeid(Device).name()); +#endif #else int test_ger(const std::string& /*caseName*/) { #endif @@ -1428,11 +1452,22 @@ int test_ger(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + 
Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); +#else + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); +#endif #endif if (true) { Test::GerTester::test( view_stride_adapter<_ViewTypeExpected, true> h_vanilla( "vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); +#endif #endif this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); @@ -1434,10 +1440,17 @@ void SyrTester:: #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " + "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); +#endif #endif std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? 
"U" : "L"; @@ -1491,10 +1504,17 @@ void SyrTester:: #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); +#endif #endif std::string mode = _useHermitianOption ? "H" : "T"; bool gotStdException(false); @@ -1561,10 +1581,17 @@ void SyrTester:: template #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_syr(const std::string& caseName) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); +#else + Kokkos::printf( + "+=======================================================================" + "===\n"); + Kokkos::printf("Starting %s ...\n", caseName.c_str()); +#endif #else int test_syr(const std::string& /*caseName*/) { #endif @@ -1582,11 +1609,19 @@ int test_syr(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); +#endif #endif if (true) { Test::SyrTester mag_type(eps)) { +#if KOKKOS_VERSION < 40199 
KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), KAT::imag(view2(i)), eps); +#else + Kokkos::printf( + "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " + "(eps = %e)\n", + (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), + KAT::imag(view2(i)), eps); +#endif num_diffs++; } } diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 37bb8fce80..2109e03b82 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -35,18 +35,33 @@ #include // typeid (T) #include +#if KOKKOS_VERSION < 40199 #define FAILURE() \ { \ KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Failure\n", __FILE__, __func__, \ __LINE__); \ success = 0; \ } +#else +#define FAILURE() \ + { \ + Kokkos::printf("%s:%s:%d: Failure\n", __FILE__, __func__, \ + __LINE__); \ + success = 0; \ + } +#endif #if 0 +#if KOKKOS_VERSION < 40199 #define TRACE() \ KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Trace\n", __FILE__, __func__, \ __LINE__); #else +#define TRACE() \ + Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, \ + __LINE__); +#endif +#else #define TRACE() #endif @@ -181,7 +196,11 @@ class ArithTraitsTesterBase { // T, but we check for this int constant for compatibility with // std::numeric_limits. if (!AT::is_specialized) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT::is_specialized\n"); +#else + Kokkos::printf("! AT::is_specialized\n"); +#endif FAILURE(); } @@ -189,13 +208,23 @@ class ArithTraitsTesterBase { // function, just not to its class methods (which are not marked // as device functions). 
if (AT::is_integer != std::numeric_limits::is_integer) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_integer not same as numeric_limits\n"); +#else + Kokkos::printf( + "AT::is_integer not same as numeric_limits\n"); +#endif FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_exact not same as numeric_limits\n"); +#else + Kokkos::printf( + "AT::is_exact not same as numeric_limits\n"); +#endif FAILURE(); } @@ -204,34 +233,62 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. if (zero + zero != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 0 != 0\n"); +#else + Kokkos::printf("0 + 0 != 0\n"); +#endif FAILURE(); } if (zero + one != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 1 != 1\n"); +#else + Kokkos::printf("0 + 1 != 1\n"); +#endif FAILURE(); } if (one - one != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 - 1 != 0\n"); +#else + Kokkos::printf("1 - 1 != 0\n"); +#endif FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). 
if ((one + one) - one != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("(1 + 1) - 1 != 1\n"); +#else + Kokkos::printf("(1 + 1) - 1 != 1\n"); +#endif FAILURE(); } if (AT::abs(zero) != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) != 0\n"); +#else + Kokkos::printf("AT::abs(0) != 0\n"); +#endif FAILURE(); } if (AT::abs(one) != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(1) != 1\n"); +#else + Kokkos::printf("AT::abs(1) != 1\n"); +#endif FAILURE(); } if (AT::is_signed && AT::abs(-one) != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_signed and AT::abs(-1) != 1\n"); +#else + Kokkos::printf("AT::is_signed and AT::abs(-1) != 1\n"); +#endif FAILURE(); } // Need enable_if to test whether T can be compared using <=. @@ -240,7 +297,11 @@ class ArithTraitsTesterBase { // These are very mild ordering properties. // They should work even for a set only containing zero. if (AT::abs(zero) > AT::abs(AT::max())) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) > AT::abs (AT::max ())\n"); +#else + Kokkos::printf("AT::abs(0) > AT::abs (AT::max ())\n"); +#endif FAILURE(); } @@ -553,20 +614,36 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(two, three); if (!equal(result, eight)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(2,3) != 8\n"); +#else + Kokkos::printf("AT::pow(2,3) != 8\n"); +#endif FAILURE(); } } if (!equal(AT::pow(three, zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,0) != 1\n"); +#else + Kokkos::printf("AT::pow(3,0) != 1\n"); +#endif FAILURE(); } if (!equal(AT::pow(three, one), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,1) != 3\n"); +#else + Kokkos::printf("AT::pow(3,1) != 3\n"); +#endif FAILURE(); } if (!equal(AT::pow(three, two), nine)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,2) != 
9\n"); +#else + Kokkos::printf("AT::pow(3,2) != 9\n"); +#endif FAILURE(); } @@ -574,7 +651,11 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(three, three); if (!equal(result, twentySeven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,3) != 27\n"); +#else + Kokkos::printf("AT::pow(3,3) != 27\n"); +#endif FAILURE(); } } @@ -583,93 +664,170 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_signed && !AT::is_complex) { result = AT::pow(-three, one); if (!equal(result, -three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,1) != -3\n"); +#else + Kokkos::printf("AT::pow(-3,1) != -3\n"); +#endif FAILURE(); } result = AT::pow(-three, two); if (!equal(result, nine)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,2) != 9\n"); +#else + Kokkos::printf("AT::pow(-3,2) != 9\n"); +#endif FAILURE(); } result = AT::pow(-three, three); if (!equal(result, -twentySeven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,3) != 27\n"); +#else + Kokkos::printf("AT::pow(-3,3) != 27\n"); +#endif FAILURE(); } } if (!equal(AT::sqrt(zero), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(0) != 0\n"); +#else + Kokkos::printf("AT::sqrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(1) != 1\n"); +#else + Kokkos::printf("AT::sqrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(thirtySix), six)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(36) != 6\n"); +#else + Kokkos::printf("AT::sqrt(36) != 6\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(sixtyFour), eight)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(64) != 8\n"); +#else + Kokkos::printf("AT::sqrt(64) != 8\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt(fortyTwo), six)) { +#if KOKKOS_VERSION < 40199 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:sqrt(42) != 6\n"); +#else + Kokkos::printf("AT:sqrt(42) != 6\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(oneTwentySeven), eleven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(127) != 11\n"); +#else + Kokkos::printf("AT::sqrt(127) != 11\n"); +#endif FAILURE(); } } if (!equal(AT::cbrt(zero), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); +#else + Kokkos::printf("AT::cbrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); +#else + Kokkos::printf("AT::cbrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); +#else + Kokkos::printf("AT::cbrt(27) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 4\n"); +#else + Kokkos::printf("AT::cbrt(64) != 4\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); +#else + Kokkos::printf("AT:cbrt(42) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); +#else + Kokkos::printf("AT::cbrt(127) != 5\n"); +#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); +#else + Kokkos::printf("AT::cbrt(0) != 1\n"); +#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); +#else + Kokkos::printf( + "AT::conj(exp(complex(2,2))) != 
AT::exp(conj(complex(2,2)))\n"); +#endif FAILURE(); } } if (!equal(AT::log(one), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log(1) != 0\n"); +#else + Kokkos::printf("AT::log(1) != 0\n"); +#endif FAILURE(); } if (!equal(AT::log10(one), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log10(1) != 0\n"); +#else + Kokkos::printf("AT::log10(1) != 0\n"); +#endif FAILURE(); } @@ -678,13 +836,23 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } else { @@ -692,27 +860,49 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#else + Kokkos::printf( + "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(1)) != 1\n"); +#else + Kokkos::printf("AT::asin(sin(1)) != 1\n"); 
+#endif FAILURE(); } if (!equal(AT::acos(AT::cos(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(1)) != 1\n"); +#else + Kokkos::printf("AT::acos(cos(1)) != 1\n"); +#endif FAILURE(); } if (!equal(AT::atan(AT::tan(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(1)) != 1\n"); +#else + Kokkos::printf("AT::atan(tan(1)) != 1\n"); +#endif FAILURE(); } @@ -839,41 +1029,74 @@ class ArithTraitsTesterTranscendentalBase } if (!equal(AT::cbrt(zero), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); +#else + Kokkos::printf("AT::cbrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); +#else + Kokkos::printf("AT::cbrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); +#else + Kokkos::printf("AT::cbrt(27) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 4\n"); +#else + Kokkos::printf("AT::cbrt(64) != 4\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); +#else + Kokkos::printf("AT:cbrt(42) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); +#else + Kokkos::printf("AT::cbrt(127) != 5\n"); +#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); +#else + Kokkos::printf("AT::cbrt(0) != 1\n"); +#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { +#if 
KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); +#else + Kokkos::printf( + "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); +#endif FAILURE(); } } @@ -891,13 +1114,23 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } else { @@ -905,27 +1138,49 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#else + Kokkos::printf( + "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(3)) != 3\n"); +#else + Kokkos::printf("AT::asin(sin(3)) != 3\n"); +#endif FAILURE(); } if (!equal(AT::acos(AT::cos(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(3)) != 3\n"); +#else + 
Kokkos::printf("AT::acos(cos(3)) != 3\n"); +#endif FAILURE(); } if (!equal(AT::atan(AT::tan(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(3)) != 3\n"); +#else + Kokkos::printf("AT::atan(tan(3)) != 3\n"); +#endif FAILURE(); } @@ -1017,10 +1272,17 @@ class ArithTraitsTesterComplexBase #else { if (AT::is_signed != std::numeric_limits::is_signed) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_signed = 0x%x, std::numeric_limits::is_signed " "= 0x%x\n", AT::is_signed, std::numeric_limits::is_signed); +#else + Kokkos::printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); +#endif FAILURE(); } } @@ -1233,12 +1495,20 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (AT::is_exact) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_exact is 1\n"); +#else + Kokkos::printf("AT::is_exact is 1\n"); +#endif FAILURE(); } if (!AT::isNan(AT::nan())) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); +#else + Kokkos::printf("NaN is not NaN\n"); +#endif FAILURE(); } @@ -1246,19 +1516,35 @@ class ArithTraitsTesterFloatingPointBase const ScalarType one = AT::one(); if (AT::isInf(zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is Inf\n"); +#else + Kokkos::printf("0 is Inf\n"); +#endif FAILURE(); } if (AT::isInf(one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is Inf\n"); +#else + Kokkos::printf("1 is Inf\n"); +#endif FAILURE(); } if (AT::isNan(zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); +#else + Kokkos::printf("0 is NaN\n"); +#endif FAILURE(); } if (AT::isNan(one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); +#else + Kokkos::printf("1 is NaN\n"); +#endif FAILURE(); } @@ -1352,7 +1638,11 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (!AT::is_exact) { +#if 
KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT:is_exact\n"); +#else + Kokkos::printf("! AT:is_exact\n"); +#endif FAILURE(); } diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 476a44abf4..2f34c2d2b0 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -43,9 +43,15 @@ struct ThreadLowerBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -100,9 +106,15 @@ struct TeamLowerBoundFunctor { hv_size_type idx = KokkosKernels::lower_bound_team(handle, haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); +#endif ++lerrCount; } } diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index 9e431285fd..afff72c52a 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -43,9 +43,15 @@ struct ThreadUpperBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + 
int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -100,9 +106,15 @@ struct TeamUpperBoundFunctor { hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); +#endif ++lerrCount; } } diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index f0cb90810e..1fc864d997 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -74,8 +74,13 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( KokkosBlas::SerialScale::invoke(-1, update); if (linSolverStat == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "NewtonFunctor: Linear solve gesv returned failure! \n"); +#else + Kokkos::printf( + "NewtonFunctor: Linear solve gesv returned failure! \n"); +#endif return newton_solver_status::LIN_SOLVE_FAIL; } diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index fb55a65420..5c9e843669 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -55,18 +55,32 @@ int KOKKOS_INLINE_FUNCTION team_spmv( // Check compatibility of dimensions at run time. 
if (values.extent(0) != colIndices.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); +#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL @@ -109,18 +123,32 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( // Check compatibility of dimensions at run time. 
if (values.extent(0) != colIndices.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); +#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 8c9c72017a..ee467afb25 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -89,9 +89,15 @@ struct fSPMV { if (error > eps * max_val) { err++; +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); +#else + Kokkos::printf( + "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); +#endif } } @@ -101,10 +107,17 @@ struct fSPMV { if (error > eps * max_val) { err++; +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, eps * max_val); +#else + Kokkos::printf( + "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, 
+ AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, + eps * max_val); +#endif } } }; From 1d54addae11c2a1122acdcd905d30cfe2e77050a Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 22 Sep 2023 17:33:09 -0600 Subject: [PATCH 188/231] Add warmup to KokkosSparse_kk_spmv Fixes #1978. Makes a significant difference when timing cusparse spmv! --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 285cf026b4..8fd536fa61 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -73,6 +73,17 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, // Benchmark auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); + // Do 5 warm up calls (not timed) + for (int i = 0; i < 5; i++) { + if (num_vecs == 1) { + // run the rank-1 version + KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + } + Kokkos::DefaultExecutionSpace().fence(); + } Kokkos::Timer timer; for (int i = 0; i < loop; i++) { if (num_vecs == 1) { From 771b67cc93c2cae1380136f9c996fd638ac8bb9d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 22 Sep 2023 17:45:20 -0600 Subject: [PATCH 189/231] kk_spmv perf test: print out correct matrix stats --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 8fd536fa61..3f4893363a 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -56,6 +56,15 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, } numRows = A.numRows(); numCols = A.numCols(); + + std::cout << "A is " << numRows << "x" << 
numCols << ", with " << A.nnz() + << " nonzeros\n"; + std::cout << "SpMV mode " << mode << ", " << num_vecs + << " vectors, beta = " << beta << ", multivectors are "; + std::cout << (std::is_same_v ? "LayoutLeft" + : "LayoutRight"); + std::cout << '\n'; + mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); h_mv_type h_x = Kokkos::create_mirror_view(x); @@ -180,9 +189,6 @@ int main(int argc, char** argv) { Kokkos::initialize(argc, argv); - std::cout << size << " rows/cols, mode " << mode << ", " << num_vecs - << " vectors, beta = " << beta << ", layout " << layout << ": "; - if (layout == 'L') run_spmv(size, size, filename, loop, num_vecs, mode, beta); From 58f4f6040d9ac4fdd8e23fd1d6d7a6d8cec32b60 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 25 Sep 2023 08:39:00 -0600 Subject: [PATCH 190/231] Applying clang-format --- .../sparse/src/KokkosBatched_JacobiPrec.hpp | 2 +- blas/unit_test/Test_Blas2_ger.hpp | 28 ++++++---------- blas/unit_test/Test_Blas2_syr.hpp | 24 +++++--------- common/src/KokkosKernels_Error.hpp | 13 ++++---- common/unit_test/Test_Common_ArithTraits.hpp | 32 +++++++------------ common/unit_test/Test_Common_LowerBound.hpp | 10 +++--- common/unit_test/Test_Common_UpperBound.hpp | 10 +++--- ode/impl/KokkosODE_Newton_impl.hpp | 3 +- sparse/unit_test/Test_Sparse_spmv.hpp | 13 ++++---- 9 files changed, 52 insertions(+), 83 deletions(-) diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 82a1291268..44a982525d 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -115,7 +115,7 @@ class JacobiPrec { "magnitude and have been replaced by one, \n", (int)tooSmall); #else - Kokkos::printf( + Kokkos::printf( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); diff --git a/blas/unit_test/Test_Blas2_ger.hpp 
b/blas/unit_test/Test_Blas2_ger.hpp index 8cbe903d97..a0860bae04 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -1428,8 +1428,8 @@ int test_ger(const std::string& caseName) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); #else - Kokkos::printf("Starting %s, device = %s ...\n", - caseName.c_str(), typeid(Device).name()); + Kokkos::printf("Starting %s, device = %s ...\n", caseName.c_str(), + typeid(Device).name()); #endif #else int test_ger(const std::string& /*caseName*/) { @@ -1465,8 +1465,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #else - Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1502,8 +1501,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); #else - Kokkos::printf("Finished %s for LAYOUTLEFT\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTLEFT\n", caseName.c_str()); #endif #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1534,8 +1532,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #else - Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1571,8 +1568,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); #else - Kokkos::printf("Finished %s for LAYOUTRIGHT\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); #endif #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1602,8 +1598,7 @@ int 
test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #else - Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1636,8 +1631,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); #else - Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); #endif #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1667,8 +1661,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #else - Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1696,8 +1689,7 @@ int test_ger(const std::string& /*caseName*/) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); #else - Kokkos::printf("Finished %s for MIXED LAYOUTS\n", - caseName.c_str()); + Kokkos::printf("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); #endif #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 3299f70e01..4396c81bb2 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -1619,8 +1619,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); - Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1659,8 +1658,7 @@ int test_syr(const std::string& /*caseName*/) 
{ "+-----------------------------------------------------------------------" "---\n"); #else - Kokkos::printf("Finished %s for LAYOUTLEFT\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTLEFT\n", caseName.c_str()); Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); @@ -1682,8 +1680,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); - Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1722,8 +1719,7 @@ int test_syr(const std::string& /*caseName*/) { "+-----------------------------------------------------------------------" "---\n"); #else - Kokkos::printf("Finished %s for LAYOUTRIGHT\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); @@ -1745,8 +1741,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); - Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1785,8 +1780,7 @@ int test_syr(const std::string& /*caseName*/) { "+-----------------------------------------------------------------------" "---\n"); #else - Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", - caseName.c_str()); + Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); @@ -1807,8 +1801,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf( "+-----------------------------------------------------------------------" 
"---\n"); - Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", - caseName.c_str()); + Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif #endif if (true) { @@ -1845,8 +1838,7 @@ int test_syr(const std::string& /*caseName*/) { "+-----------------------------------------------------------------------" "---\n"); #else - Kokkos::printf("Finished %s for MIXED LAYOUTS\n", - caseName.c_str()); + Kokkos::printf("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index a179df87cf..df8b21b8df 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -90,13 +90,12 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } \ } while (0) #else -#define IMPL_KERNEL_THROW(condition, msg) \ - do { \ - if (!(condition)) { \ - Kokkos::printf("KERNEL CHECK FAILED:\n %s\n %s\n", \ - #condition, msg); \ - Kokkos::abort(""); \ - } \ +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + Kokkos::printf("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ + Kokkos::abort(""); \ + } \ } while (0) #endif diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 2109e03b82..9ed9eea99d 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -43,11 +43,10 @@ success = 0; \ } #else -#define FAILURE() \ - { \ - Kokkos::printf("%s:%s:%d: Failure\n", __FILE__, __func__, \ - __LINE__); \ - success = 0; \ +#define FAILURE() \ + { \ + Kokkos::printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); \ + success = 0; \ } #endif @@ -57,9 +56,8 @@ KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Trace\n", __FILE__, __func__, \ __LINE__); #else -#define TRACE() \ - Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, 
__func__, \ - __LINE__); +#define TRACE() \ + Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); #endif #else #define TRACE() @@ -212,8 +210,7 @@ class ArithTraitsTesterBase { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_integer not same as numeric_limits\n"); #else - Kokkos::printf( - "AT::is_integer not same as numeric_limits\n"); + Kokkos::printf("AT::is_integer not same as numeric_limits\n"); #endif FAILURE(); } @@ -222,8 +219,7 @@ class ArithTraitsTesterBase { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_exact not same as numeric_limits\n"); #else - Kokkos::printf( - "AT::is_exact not same as numeric_limits\n"); + Kokkos::printf("AT::is_exact not same as numeric_limits\n"); #endif FAILURE(); } @@ -864,8 +860,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); #else - Kokkos::printf( - "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); + Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); #endif FAILURE(); } @@ -874,8 +869,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); #else - Kokkos::printf( - "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); #endif FAILURE(); } @@ -1142,8 +1136,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); #else - Kokkos::printf( - "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); + Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); #endif FAILURE(); } @@ -1152,8 +1145,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); #else - Kokkos::printf( - "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); 
#endif FAILURE(); } diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 2f34c2d2b0..6ca28b8be1 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -48,9 +48,8 @@ struct ThreadLowerBoundFunctor { __FILE__, __LINE__, int(i), int(expected_), int(idx)); #else - Kokkos::printf("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(i), - int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, + __LINE__, int(i), int(expected_), int(idx)); #endif ++lerrCount; } @@ -111,9 +110,8 @@ struct TeamLowerBoundFunctor { __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); #else - Kokkos::printf("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(handle.team_rank()), - int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + int(handle.team_rank()), int(expected_), int(idx)); #endif ++lerrCount; } diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index afff72c52a..113b76c3ad 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -48,9 +48,8 @@ struct ThreadUpperBoundFunctor { __FILE__, __LINE__, int(i), int(expected_), int(idx)); #else - Kokkos::printf("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(i), - int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, + __LINE__, int(i), int(expected_), int(idx)); #endif ++lerrCount; } @@ -111,9 +110,8 @@ struct TeamUpperBoundFunctor { __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); #else - Kokkos::printf("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(handle.team_rank()), - int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + 
int(handle.team_rank()), int(expected_), int(idx)); #endif ++lerrCount; } diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index 1fc864d997..d5000a74ab 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -78,8 +78,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( KOKKOS_IMPL_DO_NOT_USE_PRINTF( "NewtonFunctor: Linear solve gesv returned failure! \n"); #else - Kokkos::printf( - "NewtonFunctor: Linear solve gesv returned failure! \n"); + Kokkos::printf("NewtonFunctor: Linear solve gesv returned failure! \n"); #endif return newton_solver_status::LIN_SOLVE_FAIL; } diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index ee467afb25..b6a64e4f6d 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -94,9 +94,9 @@ struct fSPMV { "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); #else - Kokkos::printf( - "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, - AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); + Kokkos::printf("expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, + eps * max_val); #endif } } @@ -113,10 +113,9 @@ struct fSPMV { AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, eps * max_val); #else - Kokkos::printf( - "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, - AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, - eps * max_val); + Kokkos::printf("expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", + i, j, AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), + error, eps * max_val); #endif } } From fefcb0f908966609d0285a4fd6dd6c7f10848796 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 28 Sep 2023 16:44:18 -0600 Subject: [PATCH 191/231] Test_Sparse_spmv_bsr.hpp: use custom optional to 
work around CUDA 11 ICE --- sparse/unit_test/Test_Sparse_Controls.hpp | 18 ++++++ sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 69 ++++++++++++++--------- 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_Controls.hpp b/sparse/unit_test/Test_Sparse_Controls.hpp index 7da8e19e97..79679f8173 100644 --- a/sparse/unit_test/Test_Sparse_Controls.hpp +++ b/sparse/unit_test/Test_Sparse_Controls.hpp @@ -38,7 +38,25 @@ void test_controls_set() { EXPECT_EQ(c.getParameter("", "default"), "default"); } +void test_controls_il() { + { + KokkosKernels::Experimental::Controls c({{"key1", "val1"}}); + EXPECT_EQ(c.isParameter("blah"), false); + EXPECT_EQ(c.getParameter("blah"), ""); + EXPECT_EQ(c.getParameter("key1"), "val1"); + } + { + KokkosKernels::Experimental::Controls c( + {{"key1", "val1"}, {"key2", "val2"}}); + EXPECT_EQ(c.isParameter("blah"), false); + EXPECT_EQ(c.getParameter("blah"), ""); + EXPECT_EQ(c.getParameter("key1"), "val1"); + EXPECT_EQ(c.getParameter("key2"), "val2"); + } +} + TEST_F(TestCategory, controls_empty) { test_controls_empty(); } TEST_F(TestCategory, controls_set) { test_controls_set(); } +TEST_F(TestCategory, controls_il) { test_controls_il(); } #endif // TEST_SPARSE_CONTROLS_HPP diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 164c87b8df..b80d3678b1 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -53,6 +53,29 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; +/* Poor-man's std::optional since CUDA 11.0 seems to have an ICE + https://github.com/kokkos/kokkos-kernels/issues/1943 +*/ +struct OptCtrls { + KokkosKernels::Experimental::Controls ctrls_; + bool present_; + + OptCtrls() : present_(false) {} + OptCtrls(const KokkosKernels::Experimental::Controls &ctrls) + : present_(true), ctrls_(ctrls) {} + + operator bool() const { return present_; } + + 
constexpr const KokkosKernels::Experimental::Controls &operator*() + const &noexcept { + return ctrls_; + } + constexpr const KokkosKernels::Experimental::Controls *operator->() const + noexcept { + return &ctrls_; + } +}; + namespace Test_Spmv_Bsr { /*! \brief Maximum value used to fill A */ @@ -151,10 +174,9 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { template -void test_spmv( - const std::optional &controls, - const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, - const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { +void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; using KATS = Kokkos::ArithTraits; @@ -369,21 +391,19 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, // cover a variety of controls using Ctrls = KokkosKernels::Experimental::Controls; - using OptCtrls = std::optional; - std::vector ctrls = { - std::nullopt, // no controls - OptCtrls(std::in_place, Ctrls()), - OptCtrls(std::in_place, Ctrls({{"algorithm", "tpl"}})), - OptCtrls(std::in_place, Ctrls({{"algorithm", "v4.1"}}))}; + std::vector ctrls = {OptCtrls(), // no controls + OptCtrls(Ctrls()), // empty controls + OptCtrls(Ctrls({{"algorithm", "tpl"}})), + OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(Ctrls({{"algorithm", "experimental_tc"}})); + ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}})); + 
ctrls.push_back(OptCtrls(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); #endif // AMPERE #endif // AMPERE || VOLTA } @@ -481,10 +501,9 @@ void test_spmv() { // it's for A. template -void test_spm_mv( - const std::optional &controls, - const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, - const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { +void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; using KATS = Kokkos::ArithTraits; @@ -607,21 +626,19 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, // cover a variety of controls using Ctrls = KokkosKernels::Experimental::Controls; - using OptCtrls = std::optional; - std::vector ctrls = { - std::nullopt, // no controls - OptCtrls(std::in_place, Ctrls()), - OptCtrls(std::in_place, Ctrls({{"algorithm", "tpl"}})), - OptCtrls(std::in_place, Ctrls({{"algorithm", "v4.1"}}))}; + std::vector ctrls = {OptCtrls(), // no controls + OptCtrls(Ctrls()), // empty controls + OptCtrls(Ctrls({{"algorithm", "tpl"}})), + OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(Ctrls({{"algorithm", "experimental_tc"}})); + ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}})); + ctrls.push_back(OptCtrls(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); #endif // AMPERE #endif // AMPERE || VOLTA } From 414cfdcaae6b044c1d0005769b40e62992800d64 Mon 
Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 3 Oct 2023 07:26:14 -0600 Subject: [PATCH 192/231] Sparse: spmv bsr CUDA11 ICE Fixing issue with constructor that does not follow the order of attribute declaration --- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index b80d3678b1..5b823a22f7 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -57,8 +57,8 @@ using kokkos_complex_float = Kokkos::complex; https://github.com/kokkos/kokkos-kernels/issues/1943 */ struct OptCtrls { - KokkosKernels::Experimental::Controls ctrls_; bool present_; + KokkosKernels::Experimental::Controls ctrls_; OptCtrls() : present_(false) {} OptCtrls(const KokkosKernels::Experimental::Controls &ctrls) From 7d3095fda0d8502826a05c81089b00a68ffaf18b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 3 Oct 2023 13:41:55 -0600 Subject: [PATCH 193/231] cm_test_all_sandia: update caraway mi2*0 queue modules --- scripts/cm_test_all_sandia | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 15bbb53711..f939060320 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -177,12 +177,11 @@ if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name fi if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues - MACHINE=caraway - source /etc/profile.d/lmod.sh + MACHINE=vega90a_caraway fi if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues - MACHINE=caraway + MACHINE=vega90a_caraway fi if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then @@ -699,6 +698,38 @@ elif [ "$MACHINE" = "caraway" ]; then if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA908" fi +elif [ "$MACHINE" = "vega90a_caraway" ]; then + SKIP_HWLOC=True + # BUILD_ONLY=True + # 
report_and_log_test_result: only testing compilation of code for now, + # output description and success based only on build succes; build time output (no run-time) + + BASE_MODULE_LIST="cmake,/" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + + HIPCLANG_BUILD_LIST="Hip_Serial" + HIPCLANG_WARNING_FLAGS="" + + if [ "$SPOT_CHECK_TPLS" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.6.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + fi + + + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=VEGA90A" + fi elif [ "$MACHINE" = "blake" ]; then MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" From 73472a813939b89a157e7aee45839771b1aeef6a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 5 Oct 2023 11:47:29 -0600 Subject: [PATCH 194/231] CMake: Adding logic to catch bad Kokkos version While configuring Kokkos Kernels look for the Kokkos package and check that its version is compatible with the current version of Kokkos Kernels. If not we fail configuration with a helpful error message. 
--- CMakeLists.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77156d8ec7..8b37322e90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,9 +124,13 @@ ELSE() # Regular build, not install testing # Do all the regular option processing IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) - # This is a standalone build - FIND_PACKAGE(Kokkos REQUIRED) - MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") + # This is a standalone build + FIND_PACKAGE(Kokkos REQUIRED) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99")) + MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") + ELSE() + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00 or develop (4.1.99)") + ENDIF() ENDIF() INCLUDE(cmake/kokkos_backends.cmake) From 64d73e9d7ab527445664e44d9bf186a29b9adb94 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Sat, 7 Oct 2023 01:25:44 -0600 Subject: [PATCH 195/231] Only deep_copy from device to host if supernodal sptrsv --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b7e7fa1650..6a770583b6 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2908,10 +2908,9 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle is // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = 
thandle.get_nodes_grouped_by_level(); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2920,15 +2919,23 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; + using row_map_host_view_t = Kokkos::View row_map_host; + + row_map_host_view_t row_map_host; const scalar_t zero(0.0); const scalar_t one(1.0); - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - Kokkos::View row_map_host( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); + auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + + row_map_host = row_map_host_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } // inversion options const bool invert_diagonal = thandle.get_invert_diagonal(); @@ -3293,19 +3300,24 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; + using range_type = Kokkos::pair; + using 
row_map_host_view_t = Kokkos::View row_map_host; - using range_type = Kokkos::pair; + row_map_host_view_t row_map_host; const scalar_t zero(0.0); const scalar_t one(1.0); auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - Kokkos::View row_map_host( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + + row_map_host = row_map_host_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } // supernode sizes const int *supercols = thandle.get_supercols(); From 02af603a3a0dc7b979ed469329c7144b145fdd70 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Sat, 7 Oct 2023 01:34:49 -0600 Subject: [PATCH 196/231] Apply clang format --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6a770583b6..9124b1ee21 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2919,7 +2919,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View row_map_host; + using row_map_host_view_t = + Kokkos::View row_map_host; row_map_host_view_t row_map_host; @@ -2933,7 +2934,9 @@ void 
lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); Kokkos::deep_copy(row_map_host, row_map); } @@ -3301,7 +3304,8 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View row_map_host; + using row_map_host_view_t = + Kokkos::View row_map_host; row_map_host_view_t row_map_host; @@ -3315,7 +3319,9 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), row_map.extent(0)); + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); Kokkos::deep_copy(row_map_host, row_map); } From 7bf18914066b86f1a411273f5054930ba02b4712 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Sat, 7 Oct 2023 02:07:14 -0600 Subject: [PATCH 197/231] Fix compile error --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 9124b1ee21..b14c9be072 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2919,8 +2919,7 @@ void 
lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; - using row_map_host_view_t = - Kokkos::View row_map_host; + using row_map_host_view_t = Kokkos::View; row_map_host_view_t row_map_host; @@ -3304,8 +3303,7 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; - using row_map_host_view_t = - Kokkos::View row_map_host; + using row_map_host_view_t = Kokkos::View; row_map_host_view_t row_map_host; From f394768f5941070be4e5fb0f0ca93b178fd9a1c7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 6 Oct 2023 14:11:07 -0600 Subject: [PATCH 198/231] upstream iostream removal fix --- batched/KokkosBatched_Util.hpp | 1 + example/half/xpy.cpp | 1 + sparse/src/KokkosSparse_Utils_rocsparse.hpp | 6 ++++-- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 7 +++++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 614a98dedb..9078281e59 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -31,6 +31,7 @@ #include #include +#include #include "Kokkos_Complex.hpp" diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 16231d64fe..238fdef187 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -17,6 +17,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" #include "KokkosKernels_default_types.hpp" +#include template struct Functor_xpy { diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index 6f79844782..cc34e55093 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -18,6 
+18,7 @@ #define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP #include +#include #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #include @@ -101,8 +102,9 @@ inline rocsparse_operation mode_kk_to_rocsparse(const char kk_mode[]) { myRocsparseOperation = rocsparse_operation_conjugate_transpose; break; default: { - std::cerr << "Mode " << kk_mode[0] << " invalid for rocSPARSE SpMV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << kk_mode[0] << " invalid for rocSPARSE SpMV.\n"; + throw std::invalid_argument(out.str()); } } return myRocsparseOperation; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e0c27099ea..23d85c0b5c 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ #define KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ +#include + #include "KokkosKernels_Controls.hpp" // cuSPARSE @@ -52,8 +54,9 @@ void spmv_cusparse(const Kokkos::Cuda& exec, myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; default: { - std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; + throw std::invalid_argument(out.str()); } } // cuSPARSE doesn't directly support mode H with real values, but this is From 2c103ed202e9cec8b325c50cc7b8e6d695ae72d4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 3 Oct 2023 14:45:22 -0600 Subject: [PATCH 199/231] blas: Test and fix gemv stream interface --- blas/impl/KokkosBlas2_gemv_spec.hpp | 19 +- blas/src/KokkosBlas2_gemv.hpp | 28 ++- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 10 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 230 +++++++++--------- blas/unit_test/Test_Blas2_gemv.hpp | 86 +++++-- docs/developer/apidocs/blas2.rst | 2 +- 6 files changed, 221 
insertions(+), 154 deletions(-) diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 42e2465494..08842a61c0 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemv_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct gemv_eti_spec_avail { #define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct gemv_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -67,14 +68,14 @@ namespace Impl { // // Implementation of KokkosBlas::gemv. -template ::value, - bool eti_spec_avail = - gemv_eti_spec_avail::value> +template < + class ExecutionSpace, class AViewType, class XViewType, class YViewType, + bool tpl_spec_avail = gemv_tpl_spec_avail::value, + bool eti_spec_avail = gemv_eti_spec_avail::value> struct GEMV { - static void gemv(const typename AViewType::execution_space& space, - const char trans[], + static void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -130,6 +131,7 @@ struct GEMV { #define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct GEMV< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -142,6 +144,7 @@ struct GEMV { #define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct GEMV< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index dbfeb06537..40ac9db249 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,14 +49,14 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D 
Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const execution_space& space, const char trans[], +void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::gemv: execution_space must be a valid Kokkos " + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemv: ExecutionSpace must be a valid Kokkos " "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: AViewType must be a Kokkos::View."); @@ -71,17 +71,17 @@ void gemv(const execution_space& space, const char trans[], static_assert(static_cast(YViewType::rank) == 1, "KokkosBlas::gemv: YViewType must have rank 1."); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: AViewType must be accessible from execution_space"); + "KokkosBlas::gemv: AViewType must be accessible from ExecutionSpace"); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: XViewType must be accessible from execution_space"); + "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: YViewType must be accessible from execution_space"); + "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); // Check compatibility of dimensions at run time. 
if (trans[0] == 'N' || trans[0] == 'n') { @@ -171,11 +171,13 @@ void gemv(const execution_space& space, const char trans[], if (useFallback) { const bool eti_spec_avail = - KokkosBlas::Impl::gemv_eti_spec_avail::value; - typedef Impl::GEMV fallback_impl_type; + KokkosBlas::Impl::gemv_eti_spec_avail::value; + typedef Impl::GEMV + fallback_impl_type; fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y); } else { - typedef Impl::GEMV impl_type; + typedef Impl::GEMV impl_type; impl_type::gemv(space, trans, alpha, A, x, beta, y); } } diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 1f5dde5b04..1496eee020 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemv_tpl_spec_avail { enum : bool { value = false }; }; @@ -32,6 +32,7 @@ struct gemv_tpl_spec_avail { LAYOUTY, MEMSPACE) \ template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -78,6 +79,7 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, LAYOUTY, MEMSPACE) \ template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -126,8 +128,9 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template <> \ + template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ @@ -164,8 +167,9 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #ifdef KOKKOS_ENABLE_SYCL #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ - template <> \ + template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 
7aa854b962..894ce884ee 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -43,51 +43,52 @@ namespace Impl { transa = 'C'; \ } -#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ - one, beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const 
YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ + one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -111,8 +112,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -128,7 +128,8 @@ namespace Impl { #define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -152,8 +153,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -171,50 +171,50 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - 
typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv( \ + transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, \ + beta_val, 
reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, @@ -288,6 +288,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -311,8 +312,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -335,6 +335,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -358,8 +359,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -381,7 +381,8 @@ namespace Impl { #define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -405,8 +406,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -433,7 +433,8 @@ namespace Impl { #define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ 
Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -457,8 +458,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -549,8 +549,9 @@ namespace Impl { } #define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ + template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -575,8 +576,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -596,8 +596,9 @@ namespace Impl { }; #define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ + template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -622,8 +623,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -643,8 +643,9 @@ namespace Impl { }; #define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ + template \ struct GEMV< \ + ExecSpace, \ Kokkos::View**, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -669,8 +670,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - 
const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -695,8 +695,9 @@ namespace Impl { }; #define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ + template \ struct GEMV< \ + ExecSpace, \ Kokkos::View**, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -721,8 +722,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -818,8 +818,9 @@ struct kokkos_to_std_type_map { }; #define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ + template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -830,16 +831,15 @@ struct kokkos_to_std_type_map { Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using device_type = Kokkos::Device; \ - using mem_traits = Kokkos::MemoryTraits; \ + using device_type = Kokkos::Device; \ + using mem_traits = Kokkos::MemoryTraits; \ using AViewType = \ Kokkos::View; \ using XViewType = \ Kokkos::View; \ using YViewType = Kokkos::View; \ \ - static void gemv(const execution_space& exec, const char kk_trans[], \ + static void gemv(const ExecSpace& exec, const char kk_trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 518e7b8055..b3f3566f83 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ 
b/blas/unit_test/Test_Blas2_gemv.hpp @@ -21,8 +21,10 @@ #include namespace Test { -template -void impl_test_gemv(const char* mode, int M, int N) { +template +void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, + int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeX::value_type ScalarX; typedef typename ViewTypeY::value_type ScalarY; @@ -47,8 +49,7 @@ void impl_test_gemv(const char* mode, int M, int N) { view_stride_adapter y("Y", ldy); view_stride_adapter org_y("Org_Y", ldy); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); constexpr double max_valX = 1; constexpr double max_valY = 1; @@ -56,17 +57,17 @@ void impl_test_gemv(const char* mode, int M, int N) { { ScalarX randStart, randEnd; Test::getRandomBounds(max_valX, randStart, randEnd); - Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, x.d_view, rand_pool, randStart, randEnd); } { ScalarY randStart, randEnd; Test::getRandomBounds(max_valY, randStart, randEnd); - Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, y.d_view, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; Test::getRandomBounds(max_valA, randStart, randEnd); - Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, A.d_view, rand_pool, randStart, randEnd); } const typename KAT_Y::mag_type max_error = @@ -82,7 +83,7 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::deep_copy(expected, org_y.h_view); vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); int numErrors = 0; for (int i = 0; i < ldy; i++) { @@ -97,10 +98,12 @@ void impl_test_gemv(const char* mode, int M, int N) { << ", alpha = " << alpha << ", 
beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y.d_base, org_y.h_base); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(space, y.d_base, org_y.h_base); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, + y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; + Kokkos::fence(); // Wait for vanillaGEMV for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } @@ -108,8 +111,9 @@ void impl_test_gemv(const char* mode, int M, int N) { << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y.d_base, org_y.h_base); - KokkosBlas::gemv(mode, alpha, A.d_view_const, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(space, y.d_base, org_y.h_base); + KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, + y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { @@ -123,9 +127,11 @@ void impl_test_gemv(const char* mode, int M, int N) { beta = KAT_Y::zero(); // beta changed, so update the correct answer vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - Kokkos::deep_copy(y.d_view, KAT_Y::nan()); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(space, y.d_view, KAT_Y::nan()); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); + + Kokkos::fence(); // Wait for vanillaGEMV numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::isNan(y.h_view(i)) || @@ -141,6 +147,13 @@ void impl_test_gemv(const char* mode, int M, int N) { EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect"; } +template +void impl_test_gemv(const char* mode, int M, int N) { + using execution_space = typename Device::execution_space; + execution_space space; + 
impl_test_gemv_streams(space, mode, M, N); +} } // namespace Test template @@ -310,3 +323,48 @@ TEST_F(TestCategory, gemv_double_int) { // Kokkos::Profiling::popRegion(); } #endif + +template +int test_gemv_streams(const char* mode) { + using execution_space = typename Device::execution_space; + execution_space space; +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + using view_type_c_ll = Kokkos::View; + Test::impl_test_gemv_streams(space, mode, 0, 1024); + Test::impl_test_gemv_streams(space, mode, 13, 1024); + Test::impl_test_gemv_streams(space, mode, 50, 40); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + using view_type_a_lr = Kokkos::View; + using view_type_b_lr = Kokkos::View; + using view_type_c_lr = Kokkos::View; + Test::impl_test_gemv_streams(space, mode, 0, 1024); + Test::impl_test_gemv_streams(space, mode, 13, 1024); + Test::impl_test_gemv_streams(space, mode, 50, 40); +#endif + (void)space; + return 1; +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gemv_streams("N"); \ + test_gemv_streams("T"); \ + } + +#define NO_TEST_COMPLEX + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#undef NO_TEST_COMPLEX \ No newline at end of file diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 9d96567929..434e9caf03 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -3,8 +3,8 @@ BLAS2 -- KokkosKernels blas2 interfaces gemv ---- +.. doxygenfunction:: KokkosBlas::gemv(const ExecutionSpace &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) .. 
doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) -.. doxygenfunction:: KokkosBlas::gemv(const execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) ger ---- From 7e6e8c6ce81cb5bb9036a053221fbb5c9425aee5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 11 Oct 2023 12:05:38 -0600 Subject: [PATCH 200/231] Another upstream iostream removal fix Fix for compilation errors with cuda/11.2 and tpls enabled Pattern match from #1991 --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index dbf94c913d..30e0b6e243 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ #define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ +#include + #include "KokkosKernels_Controls.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -119,8 +121,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, case 'T': opA = CUSPARSE_OPERATION_TRANSPOSE; break; case 'H': opA = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; default: { - std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; + throw std::invalid_argument(out.str()); } } From 470a98875b9f7c3889418128263935d036004e6f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 26 Sep 2023 09:10:16 -0600 Subject: [PATCH 201/231] Common: remove half and bhalf implementations Both half and bhalf 
implementation details have been moved to Kokkos Core so their implementation in ArithTraits is replaced by the macro that calls Kokkos Core math functions instead. --- common/src/KokkosKernels_Half.hpp | 2 + common/src/Kokkos_ArithTraits.hpp | 50 ++++++++++++++++--------- test_common/KokkosKernels_TestUtils.hpp | 2 + 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/common/src/KokkosKernels_Half.hpp b/common/src/KokkosKernels_Half.hpp index eddd2f1e4c..c22646b5aa 100644 --- a/common/src/KokkosKernels_Half.hpp +++ b/common/src/KokkosKernels_Half.hpp @@ -14,6 +14,7 @@ // //@HEADER +#if KOKKOS_VERSION < 40199 #ifndef KOKKOSKERNELS_HALF_HPP #define KOKKOSKERNELS_HALF_HPP @@ -61,3 +62,4 @@ namespace Experimental { } // namespace Experimental } // namespace KokkosKernels #endif // KOKKOSKERNELS_HALF_HPP +#endif // KOKKOS_VERSION < 40199 diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 31744f7a8f..a5ea7c6376 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -25,7 +25,9 @@ #include #include #include +#if KOKKOS_VERSION < 40199 #include +#endif #include @@ -197,8 +199,6 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, namespace Kokkos { // Macro to automate the wrapping of Kokkos Mathematical Functions -// in the ArithTraits struct for real floating point types, hopefully -// this can be expanded to Kokkos::half_t and Kokkos::bhalf_t #define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ static FUNC_QUAL val_type zero() { return static_cast(0); } \ static FUNC_QUAL val_type one() { return static_cast(1); } \ @@ -912,8 +912,6 @@ class ArithTraits { //@} }; -// Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits if half_t is a backend specialization #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template <> class ArithTraits { @@ -926,8 +924,9 @@ class ArithTraits { static constexpr bool is_integer = false; static 
constexpr bool is_exact = false; static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half( Kokkos::Experimental::infinity::value); @@ -1028,16 +1027,21 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type epsilon() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); } +#endif + // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - // C++ doesn't have a standard "half-float" type. - using halfPrecision = val_type; - using doublePrecision = double; + using magnitudeType = mag_type; + using halfPrecision = Kokkos::Experimental::half_t; + using doublePrecision = float; + + static std::string name() { return "half_t"; } static constexpr bool isComplex = false; static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1047,7 +1051,6 @@ class ArithTraits { static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "half"; } static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } @@ -1077,8 +1080,11 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } +#else + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +#endif }; -#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF +#endif // #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT // Since Kokkos::Experimental::bhalf_t falls back to float, only define // ArithTraits if bhalf_t is a backend specialization @@ -1094,8 +1100,9 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static 
constexpr bool is_complex = false; - static constexpr bool has_infinity = true; + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_bhalf( Kokkos::Experimental::infinity::value); @@ -1193,16 +1200,23 @@ class ArithTraits { // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); } +#endif + // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; - // C++ doesn't have a standard "bhalf-float" type. - using bhalfPrecision = val_type; - using doublePrecision = double; + using bhalfPrecision = Kokkos::Experimental::bhalf_t; + // There is no type that has twice the precision as bhalf_t. + // The closest type would be float. + using doublePrecision = void; static constexpr bool isComplex = false; static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; + + static std::string name() { return "bhalf_t"; } + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1212,7 +1226,6 @@ class ArithTraits { static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "bhalf"; } static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } @@ -1242,8 +1255,11 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } +#else + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +#endif }; -#endif // KOKKOS_BHALF_T_IS_FLOAT +#endif // #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT template <> class ArithTraits { diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 330c00cde6..236bcdd1c8 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ 
b/test_common/KokkosKernels_TestUtils.hpp @@ -411,6 +411,7 @@ class epsilon { constexpr static double value = std::numeric_limits::epsilon(); }; +#if KOKKOS_VERSION < 40199 // explicit epsilon specializations #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template <> @@ -428,6 +429,7 @@ class epsilon { constexpr static double value = 0.0078125F; }; #endif // KOKKOS_HALF_T_IS_FLOAT +#endif // KOKKOS_VERSION < 40199 using KokkosKernels::Impl::getRandomBounds; From 74bdf29a0c007fa6c9c96d9e7e5930027d082775 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 26 Sep 2023 09:13:09 -0600 Subject: [PATCH 202/231] Apply clang-format --- common/src/Kokkos_ArithTraits.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index a5ea7c6376..7eb8762487 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -924,7 +924,7 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; + static constexpr bool has_infinity = true; #if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { @@ -1100,7 +1100,7 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; + static constexpr bool has_infinity = true; #if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { @@ -1203,8 +1203,8 @@ class ArithTraits { #endif // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using bhalfPrecision = Kokkos::Experimental::bhalf_t; + using magnitudeType = mag_type; + using bhalfPrecision = Kokkos::Experimental::bhalf_t; // There is no type that has twice the precision as bhalf_t. // The closest type would be float. 
using doublePrecision = void; From 7b12a5f420a94b1d484198228debe9b93298d787 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 2 Oct 2023 08:06:04 -0600 Subject: [PATCH 203/231] Address CI failures --- common/unit_test/Test_Common_ArithTraits.hpp | 36 ++++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 9ed9eea99d..9834e48a36 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -413,9 +413,20 @@ class ArithTraitsTesterBase { } if (AT::has_infinity) { - if (!AT::isInf(AT::infinity())) { - out << "AT::isInf (inf) != true" << endl; - FAILURE(); +// Compiler intrinsic casts from inf of type half_t / bhalf_t to inf +// of type float in CUDA, SYCL and HIP do not work yet. +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) + namespace KE = Kokkos::Experimental; + if constexpr (!std::is_same::value && + !std::is_same::value) { +#else + { +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP + if (!AT::isInf(AT::infinity())) { + out << "AT::isInf (inf) != true" << endl; + FAILURE(); + } } } if (!std::is_same::value) { @@ -1495,13 +1506,24 @@ class ArithTraitsTesterFloatingPointBase FAILURE(); } - if (!AT::isNan(AT::nan())) { +// Compiler intrinsic casts from nan of type half_t / bhalf_t to nan +// of type float in CUDA, SYCL and HIP do not work yet. 
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) + namespace KE = Kokkos::Experimental; + if constexpr (!std::is_same::value && + !std::is_same::value) { +#else + { +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP + if (!AT::isNan(AT::nan())) { #if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); #else - Kokkos::printf("NaN is not NaN\n"); + Kokkos::printf("NaN is not NaN\n"); #endif - FAILURE(); + FAILURE(); + } } const ScalarType zero = AT::zero(); From 7ca166d414b6634dba6626200f2f41b5830f9ae9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 11 Oct 2023 16:32:10 -0600 Subject: [PATCH 204/231] Arith Traits: fix issue with isnan for HIP and SYCL with half_t Basically that function is not supported for these backends so we remove it and remove the associated testing. --- common/src/Kokkos_ArithTraits.hpp | 81 ++++++++++++++++++++ common/unit_test/Test_Common_ArithTraits.hpp | 43 +++++++++++ 2 files changed, 124 insertions(+) diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 7eb8762487..17296185e7 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -279,6 +279,83 @@ namespace Kokkos { static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } +// Macro to automate the wrapping of Kokkos Mathematical Functions +#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static FUNC_QUAL val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static FUNC_QUAL val_type infinity() { \ + return
Kokkos::Experimental::infinity::value; \ + } \ + static FUNC_QUAL val_type nan() { \ + return Kokkos::Experimental::quiet_NaN::value; \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return Kokkos::Experimental::epsilon::value; \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int base() { \ + return Kokkos::Experimental::radix::value; \ + } \ + static FUNC_QUAL mag_type prec() { \ + return epsilon() * static_cast(base()); \ + } \ + static FUNC_QUAL int t() { \ + return Kokkos::Experimental::digits::value; \ + } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { \ + return Kokkos::Experimental::min_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int emax() { \ + return Kokkos::Experimental::max_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return 
Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + #define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ \ static constexpr bool is_specialized = true; \ @@ -1080,9 +1157,13 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } +#else +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) #else KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) #endif +#endif }; #endif // #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 9834e48a36..058281675d 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -1545,6 +1545,8 @@ class ArithTraitsTesterFloatingPointBase #endif FAILURE(); } +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP + if constexpr(!std::is_same_v) { if (AT::isNan(zero)) { #if KOKKOS_VERSION < 
40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); @@ -1561,6 +1563,25 @@ class ArithTraitsTesterFloatingPointBase #endif FAILURE(); } + } +#else + if (AT::isNan(zero)) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); +#else + Kokkos::printf("0 is NaN\n"); +#endif + FAILURE(); + } + if (AT::isNan(one)) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); +#else + Kokkos::printf("1 is NaN\n"); +#endif + FAILURE(); + } +#endif // Call the base class' implementation. Every subclass' // implementation of operator() must do this, in order to include @@ -1585,10 +1606,19 @@ class ArithTraitsTesterFloatingPointBase // if (std::numeric_limits::is_iec559) { // success = success && AT::isInf (AT::inf ()); +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + if (!AT::isNan(AT::nan())) { + out << "isNan or nan failed" << endl; + FAILURE(); + } + } +#else if (!AT::isNan(AT::nan())) { out << "isNan or nan failed" << endl; FAILURE(); } +#endif //} const ScalarType zero = AT::zero(); @@ -1602,6 +1632,18 @@ class ArithTraitsTesterFloatingPointBase out << "isInf(one) is 1" << endl; FAILURE(); } +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + if (AT::isNan(zero)) { + out << "isNan(zero) is 1" << endl; + FAILURE(); + } + if (AT::isNan(one)) { + out << "isNan(one) is 1" << endl; + FAILURE(); + } + } +#else if (AT::isNan(zero)) { out << "isNan(zero) is 1" << endl; FAILURE(); @@ -1610,6 +1652,7 @@ class ArithTraitsTesterFloatingPointBase out << "isNan(one) is 1" << endl; FAILURE(); } +#endif // Call the base class' implementation. 
Every subclass' // implementation of testHostImpl() should (must) do this, in From caef00b5c90577feeefaea69ea4d68fe2b2983e5 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 11 Oct 2023 16:34:32 -0600 Subject: [PATCH 205/231] Arith Traits: applying clang-format --- common/unit_test/Test_Common_ArithTraits.hpp | 49 ++++++++++---------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 058281675d..1d9a4c6480 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -1545,24 +1545,25 @@ class ArithTraitsTesterFloatingPointBase #endif FAILURE(); } -#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP - if constexpr(!std::is_same_v) { - if (AT::isNan(zero)) { +#if defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP + if constexpr (!std::is_same_v) { + if (AT::isNan(zero)) { #if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); #else - Kokkos::printf("0 is NaN\n"); + Kokkos::printf("0 is NaN\n"); #endif - FAILURE(); - } - if (AT::isNan(one)) { + FAILURE(); + } + if (AT::isNan(one)) { #if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); #else - Kokkos::printf("1 is NaN\n"); + Kokkos::printf("1 is NaN\n"); #endif - FAILURE(); - } + FAILURE(); + } } #else if (AT::isNan(zero)) { @@ -1608,11 +1609,11 @@ class ArithTraitsTesterFloatingPointBase // success = success && AT::isInf (AT::inf ()); #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) if constexpr (!std::is_same_v) { - if (!AT::isNan(AT::nan())) { - out << "isNan or nan failed" << endl; - FAILURE(); - } + if (!AT::isNan(AT::nan())) { + out << "isNan or nan failed" << endl; + FAILURE(); } + } #else if (!AT::isNan(AT::nan())) { out << 
"isNan or nan failed" << endl; @@ -1634,14 +1635,14 @@ class ArithTraitsTesterFloatingPointBase } #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) if constexpr (!std::is_same_v) { - if (AT::isNan(zero)) { - out << "isNan(zero) is 1" << endl; - FAILURE(); - } - if (AT::isNan(one)) { - out << "isNan(one) is 1" << endl; - FAILURE(); - } + if (AT::isNan(zero)) { + out << "isNan(zero) is 1" << endl; + FAILURE(); + } + if (AT::isNan(one)) { + out << "isNan(one) is 1" << endl; + FAILURE(); + } } #else if (AT::isNan(zero)) { From 3ad6204d06d50f7d237da33b550f11707a6e55ee Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 20 Jul 2023 14:47:26 -0600 Subject: [PATCH 206/231] Adds the merge-based spmv behind an opt-in `controls.setParameter("algorithm", "merge")`. This SpMV is up to 200x faster than the existing native implementation for matrices with highly skewed row lengths. * Adds `KokkosSparse::Impl::diagonal_search` to support the merge-based SpMV * Moves `KokkosKernels_Iota.hpp` into the `impl` directory * Removes the static assert for `Kokkos::View` and `Kokkos::Iota` from `lower_bound` and friends, since they can also operate on things that appear like a view but are not present in the common component * Changes `KokkosSparse::Impl::MergeMatrixDiagonal::MatrixPosition` -> `KokkosSparse::Impl::MergeMatrixPosition`. Various `MergeMatrixDiagonals` have the same `MatrixPosition` type, which were technically different because they were scoped. 
--- common/{src => impl}/KokkosKernels_Iota.hpp | 0 common/src/KokkosKernels_LowerBound.hpp | 22 +- .../KokkosSparse_merge_matrix.hpp} | 132 ++++++- sparse/impl/KokkosSparse_spmv_impl.hpp | 35 +- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 373 ++++++++++++++++++ sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 14 +- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 40 +- sparse/unit_test/Test_Sparse_spmv.hpp | 75 ++-- 8 files changed, 588 insertions(+), 103 deletions(-) rename common/{src => impl}/KokkosKernels_Iota.hpp (100%) rename sparse/{src/KokkosSparse_MergeMatrix.hpp => impl/KokkosSparse_merge_matrix.hpp} (56%) create mode 100644 sparse/impl/KokkosSparse_spmv_impl_merge.hpp diff --git a/common/src/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp similarity index 100% rename from common/src/KokkosKernels_Iota.hpp rename to common/impl/KokkosKernels_Iota.hpp diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index 160bd496f3..e091932453 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -77,8 +77,8 @@ namespace Impl { /*! 
\brief Single-thread sequential lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -96,9 +96,6 @@ lower_bound_sequential_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_sequential_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_sequential_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type i = 0; while (i < view.size() && pred(view(i), value)) { @@ -109,8 +106,8 @@ lower_bound_sequential_thread( /*! \brief Single-thread binary lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -127,9 +124,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_binary_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_binary_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type lo = 0; size_type hi = view.size(); @@ -150,8 +144,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( /*! 
\brief single-thread lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -168,10 +162,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); - static_assert(KokkosKernels::Impl::is_iota_v || - Kokkos::is_view::value, - "lower_bound_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); /* sequential search makes on average 0.5 * view.size memory accesses binary search makes log2(view.size)+1 accesses diff --git a/sparse/src/KokkosSparse_MergeMatrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp similarity index 56% rename from sparse/src/KokkosSparse_MergeMatrix.hpp rename to sparse/impl/KokkosSparse_merge_matrix.hpp index d573a5550f..83dfe42e84 100644 --- a/sparse/src/KokkosSparse_MergeMatrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -20,13 +20,20 @@ #include #include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_LowerBound.hpp" +#include "KokkosKernels_Predicates.hpp" #include "KokkosKernels_SafeCompare.hpp" -/// \file KokkosSparse_MergeMatrix.hpp +/// \file KokkosSparse_merge_matrix.hpp -namespace KokkosSparse { -namespace Experimental { -namespace Impl { +namespace KokkosSparse::Impl { + +// a joint index into a and b +template +struct MergeMatrixPosition { + AIndex ai; + BIndex bi; +}; /*! \class MergeMatrixDiagonal \brief a view into the entries of the Merge Matrix along a diagonal @@ -88,14 +95,7 @@ class MergeMatrixDiagonal { using a_value_type = typename AView::non_const_value_type; using b_value_type = typename BViewLike::non_const_value_type; - /*! 
\struct MatrixPosition - * \brief indices into the a_ and b_ views. - */ - struct MatrixPosition { - a_index_type ai; - b_index_type bi; - }; - using position_type = MatrixPosition; + using position_type = MergeMatrixPosition; // implement bare minimum parts of the view interface enum { rank = 1 }; @@ -145,9 +145,9 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (pos.ai >= a_.size()) { + if (size_t(pos.ai) >= a_.size()) { return true; // on the +a side out of matrix bounds is 1 - } else if (pos.bi >= b_.size()) { + } else if (size_t(pos.bi) >= b_.size()) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); @@ -192,8 +192,106 @@ class MergeMatrixDiagonal { size_type d_; ///< diagonal }; -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse +/*! \brief Return the first index on diagonal \code diag + in the merge matrix of \code a and \code b that is not 1 +This is effectively a lower-bound search on the merge matrix diagonal +where the predicate is "equals 1" +*/ +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input 
view is the merge matrix entry along the diagonal, and we + // want the first one that is not true. so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_thread( + mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const TeamMember &handle, const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input view is the merge matrix entry along the diagonal, and we + // want the first one that is not true. so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_team( + handle, mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +/*! 
\brief + + \return A MergeMatrixDiagonal::position_type + */ +template +KOKKOS_INLINE_FUNCTION auto diagonal_search( + const View &a, typename View::non_const_value_type totalWork, + typename View::size_type diag) { + using value_type = typename View::non_const_value_type; + using size_type = typename View::size_type; + + KokkosKernels::Impl::Iota iota(totalWork); + return diagonal_search(a, iota, diag); +} + +} // namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_MERGEMATRIX_HPP diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index d00808558f..16717c6e62 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -17,17 +17,22 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ #define KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ +#include + #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" +#include "KokkosSparse_spmv_impl_merge.hpp" #include "KokkosKernels_Error.hpp" namespace KokkosSparse { namespace Impl { +constexpr const char* KOKKOSSPARSE_ALG_MERGE = "merge"; + // This TransposeFunctor is functional, but not necessarily performant. 
template @@ -629,11 +634,21 @@ static void spmv_beta(const execution_space& exec, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, + beta, y); + } else { + spmv_beta_no_transpose( + exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Conjugate[0]) { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, + beta, y); + } else { + spmv_beta_no_transpose( + exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose(exec, alpha, A, x, beta, y); @@ -641,8 +656,10 @@ static void spmv_beta(const execution_space& exec, spmv_beta_transpose(exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } @@ -1460,8 +1477,10 @@ static void spmv_alpha_beta_mv( doalpha, dobeta, true>(exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp new file mode 100644 index 0000000000..7ed09ee477 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -0,0 +1,373 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP +#define KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP + +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_AlwaysFalse.hpp" + +#include "KokkosSparse_merge_matrix.hpp" + +namespace KokkosSparse::Impl { + +/*! \brief Merge-based SpMV + Hierarchical GPU implementation + Each team uses MergePath search to find the non-zeros and rows it is + responsible for Each thread in the team similarly uses diagonal search within + the team to determine which entries it will be responsible for + The threads then atomically accumulate partial produces +*/ +template +struct SpmvMergeHierarchical { + using device_type = typename YVector::device_type; + using exec_space = typename device_type::execution_space; + using y_value_type = typename YVector::non_const_value_type; + using x_value_type = typename XVector::non_const_value_type; + using A_value_type = typename AMatrix::non_const_value_type; + using A_ordinal_type = typename AMatrix::non_const_ordinal_type; + using A_size_type = typename AMatrix::non_const_size_type; + using row_map_non_const_value_type = + typename AMatrix::row_map_type::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using team_member = typename policy_type::member_type; + + using um_row_map_type = + Kokkos::View>; + + using row_map_scratch_type = + Kokkos::View>; + + using iota_type = KokkosKernels::Impl::Iota; + + using DSR = typename 
KokkosSparse::Impl::MergeMatrixDiagonal< + um_row_map_type, iota_type>::position_type; + + using KAT = Kokkos::ArithTraits; + + // results of a lower-bound and upper-bound diagonal search + struct Chunk { + DSR lb; // lower bound + DSR ub; // upper bound + }; + + template + struct Functor { + Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x, + const YVector& _y, const A_size_type pathLengthThreadChunk) + : alpha(_alpha), + A(_A), + x(_x), + y(_y), + pathLengthThreadChunk_(pathLengthThreadChunk) {} + + y_value_type alpha; + AMatrix A; + XVector x; + YVector y; + A_size_type pathLengthThreadChunk_; + + KOKKOS_INLINE_FUNCTION void operator()(const team_member& thread) const { + const A_size_type pathLengthTeamChunk = + thread.team_size() * pathLengthThreadChunk_; + + const A_size_type pathLength = A.numRows() + A.nnz(); + const A_size_type teamD = + thread.league_rank() * pathLengthTeamChunk; // diagonal + const A_size_type teamDEnd = + KOKKOSKERNELS_MACRO_MIN(teamD + pathLengthTeamChunk, pathLength); + + // iota(i) -> i + iota_type iota(A.nnz()); + + // remove leading 0 from row_map + um_row_map_type rowEnds(&A.graph.row_map(1), A.graph.row_map.size() - 1); + + // compiler thinks these are "used" in team_broadcast below, so initialize + // them with something to silence the warning + DSR lb{}; + DSR ub{}; + + // thread 0 does the lower bound, thread 1 does the upper bound + if (0 == thread.team_rank() || 1 == thread.team_rank()) { + const A_size_type d = thread.team_rank() ? 
teamDEnd : teamD; + DSR dsr = diagonal_search(rowEnds, iota, d); + if (0 == thread.team_rank()) { + lb = dsr; + } + if (1 == thread.team_rank()) { + ub = dsr; + } + } + thread.team_broadcast(lb, 0); + thread.team_broadcast(ub, 1); + const A_size_type teamNnzBegin = + lb.bi; // the first nnz this team will handle + const A_size_type teamNnzEnd = + ub.bi; // one-past the last nnz this team will handle + const A_ordinal_type teamRowBegin = + lb.ai; // <= the row than the first nnz is in + const A_ordinal_type teamRowEnd = + ub.ai; // >= the row than the last nnz is in + + // team-collaborative copy of matrix data into scratch + A_size_type* rowEndsS{nullptr}; + A_ordinal_type* entriesS{nullptr}; + A_value_type* valuesS{nullptr}; + y_value_type* yS{nullptr}; + + if constexpr (ROWENDS_USE_SCRATCH) { + rowEndsS = (A_size_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_size_type)); + + // teamRowEnd may be equal to the row the team's last nnz is in + // so in most cases we want to read it (teamRowEnd+1). 
However, + // however, guard against reading off the end of the view + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + rowEndsS[i - teamRowBegin] = rowEnds(i); + } else { + rowEndsS[i - teamRowBegin] = A.nnz(); + } + }); + } else { + (void)(rowEndsS == rowEndsS); // set but unused, expr has no effect + } + + if constexpr (NONZEROS_USE_SCRATCH) { + valuesS = (A_value_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_value_type)); + entriesS = (A_ordinal_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_ordinal_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamNnzBegin, teamNnzEnd), + [=](const A_ordinal_type& i) { + valuesS[i - teamNnzBegin] = A.values(i); + entriesS[i - teamNnzBegin] = A.graph.entries(i); + }); + } else { + (void)(entriesS == entriesS); // set but unused, expr has no effect + (void)(valuesS == valuesS); // set but unused, expr has no effect + } + + if constexpr (Y_USE_SCRATCH) { + yS = (y_value_type*)thread.team_shmem().get_shmem(pathLengthTeamChunk * + sizeof(y_value_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + yS[i - teamRowBegin] = 0; + } + }); + } else { + (void)(yS == yS); // set but unused, expr has no effect + } + + if constexpr (ROWENDS_USE_SCRATCH || NONZEROS_USE_SCRATCH || + Y_USE_SCRATCH) { + thread.team_barrier(); + } + + // each thread determines its location within the team chunk + + // team's view of row map is either in scratch or global + typename std::conditional::type teamRowEnds; + if constexpr (ROWENDS_USE_SCRATCH) { + teamRowEnds = row_map_scratch_type(rowEndsS, teamRowEnd - teamRowBegin); + } else { + teamRowEnds = + um_row_map_type(&rowEnds(teamRowBegin), teamRowEnd - teamRowBegin); + } + + iota_type teamIota(teamNnzEnd - teamNnzBegin, + teamNnzBegin); // 
teamNnzBegin.. teamRowBegin && i < teamRowEnd) { + y(i) += yS[i - teamRowBegin]; + } else { + Kokkos::atomic_add(&y(i), yS[i - teamRowBegin]); + } + } + }); + } + } + + size_t team_shmem_size(int teamSize) const { + const A_size_type pathLengthTeamChunk = pathLengthThreadChunk_ * teamSize; + (void)pathLengthTeamChunk; // silence declared but not referenced + size_t val = 0; + if constexpr (Y_USE_SCRATCH) { + val += sizeof(y_value_type) * pathLengthTeamChunk; + } + if constexpr (ROWENDS_USE_SCRATCH) { + val += sizeof(row_map_non_const_value_type) * pathLengthTeamChunk; + } + if constexpr (NONZEROS_USE_SCRATCH) { + val += sizeof(A_ordinal_type) * pathLengthTeamChunk; + val += sizeof(A_value_type) * pathLengthTeamChunk; + } + return val; + } + }; + + static void spmv(const char mode[], const y_value_type& alpha, + const AMatrix& A, const XVector& x, const y_value_type& beta, + const YVector& y) { + static_assert(XVector::rank == 1, ""); + static_assert(YVector::rank == 1, ""); + + KokkosBlas::scal(y, beta, y); + + /* determine launch parameters for different architectures + On architectures where there is a natural execution hierarchy with true + team scratch, we'll assign each team to use an appropriate amount of the + scratch. 
+ On other architectures, just have each team do the maximal amount of work + to amortize the cost of the diagonal search + */ + const A_size_type pathLength = A.numRows() + A.nnz(); + A_size_type pathLengthThreadChunk; + int teamSize; + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + pathLengthThreadChunk = 4; + teamSize = 128; + } else { + teamSize = 1; + pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) / + exec_space::concurrency(); + } + + const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize; + const int leagueSize = + (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk; + + policy_type policy(exec_space(), leagueSize, teamSize); + + /* Currently: + On GPU, assume atomics are fast, so don't accumuate into scratch. + On CPU spaces, there's no real point to using scratch, just rely on the + memory hierarchy. Using scratch just increases the number of required + atomic operations + */ + if (KokkosSparse::NoTranspose[0] == mode[0]) { + constexpr bool CONJ = false; + using GpuOp = Functor; + using CpuOp = Functor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else if (KokkosSparse::Conjugate[0] == mode[0]) { + constexpr bool CONJ = true; + using GpuOp = Functor; + using CpuOp = Functor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ + << "SpmvMergeHierarchical::spmv() called with unsupported mode " + << mode; + throw std::logic_error(ss.str()); + } + } +}; + +} // namespace KokkosSparse::Impl + +#endif // KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP diff --git 
a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index c18018f54f..a582f18e40 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ +#include + #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" @@ -923,8 +925,10 @@ static void spmv_struct_beta( dobeta, true>(exec, stencil_type, structure, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv_struct()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } @@ -1454,8 +1458,10 @@ static void spmv_alpha_beta_mv_struct( YVector, doalpha, dobeta, true>( exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp index b6301778a3..85c35c0044 100644 --- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -26,7 +26,7 @@ #include #include "KokkosKernels_Iota.hpp" -#include "KokkosSparse_MergeMatrix.hpp" +#include "KokkosSparse_merge_matrix.hpp" namespace Test_Sparse_MergeMatrix { @@ -85,8 +85,7 @@ template void view_view_empty_empty() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = 
KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-view-empty-empty-a", 0); BView b("view-view-empty-empty-b", 0); @@ -102,8 +101,7 @@ template void view_view_full_empty() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; size_t aNonzero = 5; AView a("view-view-full-empty-a", aNonzero); @@ -123,8 +121,7 @@ template void view_view_empty_full() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-view-empty-full-a", 0); BView b = from_std_vec("view-view-empty-full-b", {0, 1, 2, 3}); @@ -284,10 +281,9 @@ std::tuple view_view_case_5() { */ template void view_view_full_full() { - using AView = Kokkos::View; - using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; using mmd_value_type = typename MMD::non_const_value_type; { @@ -377,8 +373,7 @@ template void view_iota_empty_empty() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-iota-empty-empty-a", 0); BView b(0); @@ -394,8 +389,7 @@ template void view_iota_full_empty() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; size_t aNonzero = 5; AView a("view-iota-full-empty-a", aNonzero); @@ -415,8 +409,7 @@ template void view_iota_empty_full() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + 
using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-iota-empty-full-a", 0); BView b(4); @@ -487,10 +480,9 @@ std::tuple view_iota_case_1() { */ template void view_iota_full_full() { - using AView = Kokkos::View; - using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; using mmd_value_type = typename MMD::non_const_value_type; { @@ -537,8 +529,7 @@ void test_rank() { { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; static_assert(MMD::rank == 1, "MergeMatrixDiagonal should look like a rank-1 view"); } @@ -546,8 +537,7 @@ void test_rank() { { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; static_assert(MMD::rank == 1, "MergeMatrixDiagonal should look like a rank-1 view"); } diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index b6a64e4f6d..9f980fedc9 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -125,7 +125,7 @@ template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, - char mode = 'N') { + const std::string &mode = "N") { using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; using lno_view_t = typename graph_t::entries_type; @@ -136,8 +136,6 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using scalar_t = typename scalar_view_t::non_const_value_type; using KAT = Kokkos::ArithTraits; - 
mode = toupper(mode); - typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values, input_mat.values); @@ -168,13 +166,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, for (size_type j = h_rowmap(row); j < h_rowmap(row + 1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - if (mode == 'N') + if (mode == "N") h_y(row) += alpha * val * h_x(col); - else if (mode == 'C') + else if (mode == "C") h_y(row) += alpha * KAT::conj(val) * h_x(col); - else if (mode == 'T') + else if (mode == "T") h_y(col) += alpha * val * h_x(row); - else if (mode == 'H') + else if (mode == "H") h_y(col) += alpha * KAT::conj(val) * h_x(row); } } @@ -184,12 +182,14 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv( - const Controls &controls, crsMat_t input_mat, x_vector_type x, - y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, char mode, + const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat, + x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -198,7 +198,7 @@ void check_spmv( const y_value_mag_type eps = 10 * Kokkos::ArithTraits::eps(); - bool transposed = (mode == 'T') || (mode == 'H'); + bool transposed = (mode == "T") || (mode == "H"); y_vector_type expected_y( "expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); @@ -208,7 +208,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -234,9 +234,12 @@ void check_spmv_mv( crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV, char mode, + typename y_vector_type::non_const_value_type beta, int numMV, + const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -256,7 +259,7 @@ void check_spmv_mv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -491,12 +494,12 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, Kokkos::fill_random(input_mat.values, rand_pool, randomUpperBound(max_val)); - std::vector nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -528,17 +531,23 @@ template ( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } { - Controls controls; + KokkosKernels::Experimental::Controls controls; 
controls.setParameter("algorithm", "native"); test_spmv( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } } template nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -662,18 +671,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N", max_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T", max_y); // Testing all modes together, since matrix is square - std::vector modes = {'N', 'C', 'T', 'H'}; + std::vector modes = {"N", "C", "T", "H"}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; for (auto mode : modes) { for (double alpha : testAlphaBeta) { From 
245216841087370869d6f2d77b5a2f77e21e2dec Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 24 Jul 2023 10:34:43 -0600 Subject: [PATCH 207/231] KokkosKernels_Controls: add missing include Also remove pointless warning to stderr --- sparse/src/KokkosKernels_Controls.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index 1ee8cd108e..594df031a3 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -20,6 +20,7 @@ /// \brief Mechanism to control internal behavior of kernels /// \author Luc Berger-Vergiat (lberge@sandia.gov) +#include #include #include #include "KokkosKernels_config.h" From b2d22caadfd0746df2f3a15cc1db41e74847c9ee Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 26 Jul 2023 15:22:43 -0600 Subject: [PATCH 208/231] Test_Sparse_spmv.hpp: more explicit namespacing --- sparse/unit_test/Test_Sparse_spmv.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9f980fedc9..8fdb56b5f4 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -452,8 +452,9 @@ Kokkos::complex randomUpperBound>(int mag) { template -void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, - lno_t bandwidth, lno_t row_size_variance, bool heavy) { +void test_spmv(const KokkosKernels::Experimental::Controls &controls, + lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -953,7 +954,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { template void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, - const Controls &controls = Controls()) { + const KokkosKernels::Experimental::Controls 
&controls = + KokkosKernels::Experimental::Controls()) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -996,7 +998,7 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, template void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { - Controls controls; + KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "native"); test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); } // test_spmv_native From a373631a6792176f38f10b001fa7e8c9f620050f Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 28 Aug 2023 11:57:13 -0600 Subject: [PATCH 209/231] add Execution Space support for merge-based SpMV --- sparse/impl/KokkosSparse_spmv_impl.hpp | 16 ++++++++-------- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 11 ++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 16717c6e62..c3df57c65f 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -635,19 +635,19 @@ static void spmv_beta(const execution_space& exec, const YVector& y) { if (mode[0] == NoTranspose[0]) { if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { - SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, - beta, y); + SpmvMergeHierarchical::spmv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose( - exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { - SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, - beta, y); + SpmvMergeHierarchical::spmv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose( - exec, controls, alpha, A, x, beta, y); + 
spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose +template struct SpmvMergeHierarchical { using device_type = typename YVector::device_type; - using exec_space = typename device_type::execution_space; + using exec_space = ExecutionSpace; using y_value_type = typename YVector::non_const_value_type; using x_value_type = typename XVector::non_const_value_type; using A_value_type = typename AMatrix::non_const_value_type; @@ -301,8 +301,9 @@ struct SpmvMergeHierarchical { } }; - static void spmv(const char mode[], const y_value_type& alpha, - const AMatrix& A, const XVector& x, const y_value_type& beta, + static void spmv(const ExecutionSpace& space, const char mode[], + const y_value_type& alpha, const AMatrix& A, + const XVector& x, const y_value_type& beta, const YVector& y) { static_assert(XVector::rank == 1, ""); static_assert(YVector::rank == 1, ""); @@ -332,7 +333,7 @@ struct SpmvMergeHierarchical { const int leagueSize = (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk; - policy_type policy(exec_space(), leagueSize, teamSize); + policy_type policy(space, leagueSize, teamSize); /* Currently: On GPU, assume atomics are fast, so don't accumuate into scratch. 
From 0c4a2a97e58aa8316277fe415bc424d7fc1d2698 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 19 Sep 2023 11:15:03 -0600 Subject: [PATCH 210/231] exec_space::concurrency -> exec_space().concurrency() --- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index 07d4aa68e1..f1e9df66bc 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -320,13 +320,13 @@ struct SpmvMergeHierarchical { const A_size_type pathLength = A.numRows() + A.nnz(); A_size_type pathLengthThreadChunk; int teamSize; - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { pathLengthThreadChunk = 4; teamSize = 128; } else { teamSize = 1; - pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) / - exec_space::concurrency(); + pathLengthThreadChunk = (pathLength + exec_space().concurrency() - 1) / + exec_space().concurrency(); } const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize; @@ -346,7 +346,7 @@ struct SpmvMergeHierarchical { using GpuOp = Functor; using CpuOp = Functor; using Op = typename std::conditional< - KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); @@ -355,7 +355,7 @@ struct SpmvMergeHierarchical { using GpuOp = Functor; using CpuOp = Functor; using Op = typename std::conditional< - KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); From aeb9bf1654e2b370f5b5e621c41fb769ba8e7934 Mon Sep 17 00:00:00 2001 
From: Carl Pearson Date: Mon, 9 Oct 2023 09:23:53 -0600 Subject: [PATCH 211/231] KokkosSparse_spmv_impl_merge.hpp: reduce chance of identifier collision --- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index f1e9df66bc..9329b8a097 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -73,9 +73,10 @@ struct SpmvMergeHierarchical { template - struct Functor { - Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x, - const YVector& _y, const A_size_type pathLengthThreadChunk) + struct SpmvMergeImplFunctor { + SpmvMergeImplFunctor(const y_value_type& _alpha, const AMatrix& _A, + const XVector& _x, const YVector& _y, + const A_size_type pathLengthThreadChunk) : alpha(_alpha), A(_A), x(_x), @@ -299,7 +300,7 @@ struct SpmvMergeHierarchical { } return val; } - }; + }; // struct SpmvMergeImplFunctor static void spmv(const ExecutionSpace& space, const char mode[], const y_value_type& alpha, const AMatrix& A, @@ -343,8 +344,8 @@ struct SpmvMergeHierarchical { */ if (KokkosSparse::NoTranspose[0] == mode[0]) { constexpr bool CONJ = false; - using GpuOp = Functor; - using CpuOp = Functor; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; using Op = typename std::conditional< KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; @@ -352,8 +353,8 @@ struct SpmvMergeHierarchical { Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else if (KokkosSparse::Conjugate[0] == mode[0]) { constexpr bool CONJ = true; - using GpuOp = Functor; - using CpuOp = Functor; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; using Op = typename std::conditional< KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; From 2a95f5e030aaeea7a231d09a14647c05a242d13b 
Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 09:24:22 -0600 Subject: [PATCH 212/231] KokkosSparse_merge_matrix.hpp: compare with size_type --- sparse/impl/KokkosSparse_merge_matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp index 83dfe42e84..6510975c87 100644 --- a/sparse/impl/KokkosSparse_merge_matrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -145,9 +145,9 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (size_t(pos.ai) >= a_.size()) { + if (size_type(pos.ai) >= a_.size()) { return true; // on the +a side out of matrix bounds is 1 - } else if (size_t(pos.bi) >= b_.size()) { + } else if (size_type(pos.bi) >= b_.size()) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); From 09d442da94859df278618efd9b7168637e5052ad Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 11:14:11 -0600 Subject: [PATCH 213/231] KokkosSparse_spmv.hpp: 4.0 ::rank handling --- sparse/src/KokkosSparse_spmv.hpp | 34 ++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 2aab1cef60..0658adbccf 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -98,13 +98,23 @@ void spmv(const ExecutionSpace& space, typename YVector::memory_space>::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +// Make sure that x and y have the same rank. +// Make sure that x (and therefore y) is rank 1. 
+#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that x (and therefore y) is rank 1. + + static_assert(XVector::rank() == 1, + "KokkosSparse::spmv: Both Vector inputs must have rank 1 " + "in order to call this specialization of spmv."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " "in order to call this specialization of spmv."); +#endif // Make sure that y is non-const. static_assert(std::is_same::value, @@ -296,8 +306,14 @@ void spmv(const ExecutionSpace& space, typename YVector::memory_space>::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 1. static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " @@ -673,9 +689,15 @@ void spmv(const ExecutionSpace& space, Kokkos::SpaceAccessibility::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +// Make sure that x and y have the same rank. 
+#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 2. static_assert(static_cast(XVector::rank) == 2, "KokkosSparse::spmv: Both Vector inputs must have rank 2 " From b8eb1bcb31c1fb73d94fe2269e728cb7a5d94f02 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 11:37:00 -0600 Subject: [PATCH 214/231] KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp: used provided execution space for TPL --- sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 8932beb88a..97019e4682 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -819,6 +819,8 @@ void spmv_block_impl_rocsparse( "A entries must be contiguous"); rocsparse_handle handle = controls.getRocsparseHandle(); + // resets handle stream to NULL when out of scope + KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec); // set the mode rocsparse_operation trans; From 01e97739b682690ef97f68cefd5b64b0b29f70b2 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 16:30:59 -0600 Subject: [PATCH 215/231] KokkosSparse_merge_matrix.hpp: fix comparison signedness --- sparse/impl/KokkosSparse_merge_matrix.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp index 6510975c87..18c9467a9a 100644 --- a/sparse/impl/KokkosSparse_merge_matrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -31,6 +31,9 @@ namespace KokkosSparse::Impl { // a joint index 
into a and b template struct MergeMatrixPosition { + using a_index_type = AIndex; + using b_index_type = BIndex; + AIndex ai; BIndex bi; }; @@ -145,9 +148,10 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (size_type(pos.ai) >= a_.size()) { + + if (pos.ai >= typename position_type::a_index_type(a_.size())) { return true; // on the +a side out of matrix bounds is 1 - } else if (size_type(pos.bi) >= b_.size()) { + } else if (pos.bi >= typename position_type::b_index_type(b_.size())) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); @@ -161,9 +165,9 @@ class MergeMatrixDiagonal { */ KOKKOS_INLINE_FUNCTION size_type size() const noexcept { - if (d_ <= a_.size() && d_ <= b_.size()) { + if (d_ <= size_type(a_.size()) && d_ <= size_type(b_.size())) { return d_; - } else if (d_ > a_.size() && d_ > b_.size()) { + } else if (d_ > size_type(a_.size()) && d_ > size_type(b_.size())) { // TODO: this returns nonsense if d_ happens to be outside the merge // matrix return a_.size() + b_.size() - d_; @@ -182,8 +186,8 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION position_type diag_to_a_b(const size_type &di) const noexcept { position_type res; - res.ai = d_ < a_.size() ? (d_ - 1) - di : a_.size() - 1 - di; - res.bi = d_ < a_.size() ? di : d_ + di - a_.size(); + res.ai = d_ < size_type(a_.size()) ? (d_ - 1) - di : a_.size() - 1 - di; + res.bi = d_ < size_type(a_.size()) ? di : d_ + di - a_.size(); return res; } From 0ca98cabd14958330dea7e0225c24b2177bdef30 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 16 Oct 2023 17:45:10 -0600 Subject: [PATCH 216/231] BSPGEMM: removing cusparse testing for version older than 11.4.0 Older version are either hanging or returning wrong results in our unit-tests, this is fixed with more recent versions of the TPL. 
--- sparse/unit_test/Test_Sparse_bspgemm.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index 58a2a18b8a..d3c3a6134f 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -159,6 +159,15 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, return; } #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 11600) + { + std::cerr + << "TEST SKIPPED: See " + "https://github.com/kokkos/kokkos-kernels/issues/1965 for details." + << std::endl; + return; + } +#endif using namespace Test; // device::execution_space::initialize(); // device::execution_space::print_configuration(std::cout); From 40a0d65abbf2a9078a84c8f21c5bdb46bd4ab90e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 17 Oct 2023 15:09:46 -0600 Subject: [PATCH 217/231] Remove /etc/profile.d/modules.sh from cm_test_all_sandia for blake --- scripts/cm_test_all_sandia | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index f939060320..28ef93b004 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -731,7 +731,6 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then ARCH_FLAG="--arch=VEGA90A" fi elif [ "$MACHINE" = "blake" ]; then - MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 From a9e3ccf70ccb393afb54cab755bbda19f282c76c Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 17 Oct 2023 21:13:09 -0600 Subject: [PATCH 218/231] CMakeLists.txt: Update Kokkos version to 4.2.99 for version check Update corresponding to kokkos/kokkos#6520 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b37322e90..f03a44ae04 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -126,10 +126,10 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99")) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.99")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00 or develop (4.1.99)") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00, 4.1.99 or develop (4.2.99)") ENDIF() ENDIF() From 002cd2bfb0f6e3bc8532a469e002eb8eabb6a628 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 18 Oct 2023 08:26:19 -0600 Subject: [PATCH 219/231] iostream clean-up in benchmarks --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp | 2 ++ perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 1e537ceadc..fca3030763 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -42,6 +42,8 @@ //@HEADER */ +#include + #include #include diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 14957994d1..c03cbb12ad 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -42,6 +42,8 @@ //@HEADER */ +#include + #include #include From 3846ff9bcaaacc0665a6b0a403c1edf240d6d525 Mon Sep 17 
00:00:00 2001 From: Nathan Ellingwood Date: Wed, 18 Oct 2023 12:07:45 -0600 Subject: [PATCH 220/231] CMakeLists.txt: Update version to 4.2.99 Update Kokkos version check to include 4.2.00 (upcoming release) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f03a44ae04..2a8cc04476 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 1) +SET(KokkosKernels_VERSION_MINOR 2) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") @@ -126,10 +126,10 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.99")) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.99")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00, 4.1.99 or develop (4.2.99)") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00, 4.1.99, 4.2.00 or develop (4.2.99)") ENDIF() ENDIF() From 3db693b6a6f658db96e9ddac57f53a2ef3c72ed3 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 18 Oct 2023 13:38:38 -0600 Subject: [PATCH 221/231] CMakeLists.txt: Revise Kokkos_VERSION check Revsions suggested by @lucbv - Allow versions greater 
than or equal to existing release - If Kokkos_VERSION is greater than expected develop branch version, add warning to update the version check --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a8cc04476..893e4239cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,10 +126,13 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.0.1") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.1.99") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.00") OR (${Kokkos_VERSION} VERSION_EQUAL "4.2.99")) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.2.00")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") + IF((${Kokkos_VERSION} VERSION_GREATER "4.2.99")) + MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") + ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.0.1, 4.1.00, 4.1.99, 4.2.00 or develop (4.2.99)") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.1.00, 4.2.00 or develop") ENDIF() ENDIF() From 7f70eb36dd50e1806092b75be63a52231a0ae5f8 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 18 Oct 2023 12:28:46 -0600 Subject: [PATCH 222/231] CMakeLists.txt: Update version to 4.2.00 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 893e4239cd..79aaa7d380 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 2) -SET(KokkosKernels_VERSION_PATCH 99) +SET(KokkosKernels_VERSION_PATCH 00) 
SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file From d220081a080368cd9dbd737881423c7b884ce4eb Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 19 Oct 2023 07:49:42 -0600 Subject: [PATCH 223/231] Merge pull request #2001 from jgfouca/jgfouca/fix_par_ilut_docs par_ilut: Update documentation for fill_in_limit (cherry picked from commit ab0a32d81ca0a6716d450b721ce5de020fcbb442) --- sparse/src/KokkosSparse_par_ilut_handle.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 3ffe44ffca..5ea4b3c436 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -78,7 +78,13 @@ class PAR_ILUTHandle { /// iteration to iteration drops below /// this, the algorithm will stop (even if /// max_iters has not been hit) - float_t fill_in_limit; /// The threshold for the ILU factorization + float_t fill_in_limit; /// The threshold for removing candidates + /// from the intermediate L and U is set such + /// that the resulting sparsity pattern has + /// at most `fill_in_limit` times the number + /// of non-zeros of the ILU(0) + /// factorization. This selection is executed + /// separately for both factors L and U. bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. 
From 529a56eafdb57a894890fdb89e49dd4c556626a6 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 19 Oct 2023 07:49:50 -0600 Subject: [PATCH 224/231] Merge pull request #2007 from lucbv/bhalf_t_fix bhalf_t fix for isnan function (cherry picked from commit b9f3d781f65d777136deb2cc7ff9038fbe9b9c9f) --- common/src/Kokkos_ArithTraits.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 17296185e7..75c0951e10 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -1336,9 +1336,13 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } +#else +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) #else KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) #endif +#endif }; #endif // #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT From 06b314c9720d6d7d17c6e9eda422654d8c6d0de7 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 19 Oct 2023 07:50:02 -0600 Subject: [PATCH 225/231] Merge pull request #1999 from lucbv/experimental_hip_cleanup Experimental hip cleanup (cherry picked from commit d9a67b9f6ee591ba12ca70415e74694847379d43) --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 3 +- batched/dense/src/KokkosBatched_Vector.hpp | 20 +- blas/impl/KokkosBlas3_gemm_impl.hpp | 3 +- blas/impl/KokkosBlas3_gemm_spec.hpp | 2 +- blas/src/KokkosBlas2_gemv.hpp | 10 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 31 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 430 +++++++++--------- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 20 +- cmake/KokkosKernels_config.h.in | 4 +- cmake/kokkoskernels_eti_devices.cmake | 8 +- common/src/KokkosKernels_ExecSpaceUtils.hpp | 25 +- common/src/KokkosKernels_default_types.hpp | 2 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 6 +- 
.../KokkosBatched_Test_BlockTridiagJacobi.cpp | 5 +- .../blas1/KokkosBlas_dot_mv_perf_test.cpp | 2 +- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 2 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 6 +- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 2 +- ...s3_gemm_standalone_perf_test_benchmark.cpp | 2 +- perf_test/graph/KokkosGraph_color.cpp | 4 +- perf_test/graph/KokkosGraph_color_d2.cpp | 3 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 2 +- perf_test/sparse/KokkosSparse_pcg.cpp | 3 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 4 +- sparse/src/KokkosSparse_CrsMatrix.hpp | 2 +- sparse/src/KokkosSparse_spgemm_handle.hpp | 2 +- sparse/src/KokkosSparse_spmv.hpp | 2 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 35 +- test_common/Test_HIP.hpp | 2 +- 30 files changed, 301 insertions(+), 347 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index f413ba612c..f70fa6b963 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -49,8 +49,7 @@ constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { // buffering algorithm by a factor of 2. 
#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) template <> -constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dbl_buf_tile_k() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { return 16; } #endif diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 23fd62655a..71d159cb03 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -120,21 +120,19 @@ struct DefaultVectorLength, Kokkos::CudaUVMSpace> { #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; #endif @@ -189,21 +187,19 @@ struct DefaultInternalVectorLength, #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 8 }; }; template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 4 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 4 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 2 }; }; #endif diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 4f3e62f343..1a0ab46bb3 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -49,8 +49,7 @@ struct 
impl_gemm_choose_copy_layout { #ifdef KOKKOS_ENABLE_HIP template -struct impl_gemm_choose_copy_layout { +struct impl_gemm_choose_copy_layout { using type = LayoutA; }; #endif diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index c340a41fc1..367a8dad3f 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -192,7 +192,7 @@ struct GEMM { team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 40ac9db249..614b48d47a 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -147,11 +147,11 @@ void gemv(const ExecutionSpace& space, const char trans[], #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS useFallback = - useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback || + (tolower(*trans) == 'c' && + std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS useFallback = useFallback || (tolower(*trans) == 'c' && diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 1496eee020..70b5560f6e 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -127,23 +127,20 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + 
ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 894ce884ee..304dd349bf 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -548,239 +548,219 @@ namespace Impl { transa = rocblas_operation_conjugate_transpose; \ } -#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template 
\ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - 
ExecSpace, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + 
YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ 
}; -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + 
+KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 69146baf4f..8e96898b10 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -164,26 +164,22 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - 
Kokkos::Experimental::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 22b7a196fc..b8b66fffbb 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -49,14 +49,14 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::Experimental::OpenMPTarget */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_OPENMPTARGETSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index d223e00171..8c6cb540ae 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -12,7 +12,7 @@ SET(EXEC_SPACES EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) -SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::HIP) SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMPTARGET_CPP_TYPE Kokkos::Experimental::OpenMPTarget) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) @@ -31,7 +31,7 @@ SET(MEM_SPACES ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE 
Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::HIPSpace) SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) @@ -77,13 +77,13 @@ IF(KOKKOS_ENABLE_HIP) INST_EXECSPACE_HIP ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HIPSPACE ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the memory space Kokkos::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." 
) IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index eb629f9e0c..2ec09f4069 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -66,7 +66,7 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { exec_space = Exec_HIP; } #endif @@ -98,8 +98,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_HIP template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { return true; } #endif @@ -208,17 +207,17 @@ inline void kk_get_free_total_memory( #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, total_mem, - 1); +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -405,13 +404,13 @@ struct SpaceInstance { #ifdef KOKKOS_ENABLE_HIP template <> -struct SpaceInstance { - static Kokkos::Experimental::HIP create() { +struct SpaceInstance { + static Kokkos::HIP create() { hipStream_t stream; KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); - return Kokkos::Experimental::HIP(stream); + return Kokkos::HIP(stream); } - static void destroy(Kokkos::Experimental::HIP& space) { + static void destroy(Kokkos::HIP& space) { hipStream_t stream = 
space.hip_stream(); KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); } diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 30ca52e300..1da965a082 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -62,7 +62,7 @@ using default_scalar = double; #if defined(KOKKOS_ENABLE_CUDA) using default_device = Kokkos::Cuda; #elif defined(KOKKOS_ENABLE_HIP) -using default_device = Kokkos::Experimental::HIP; +using default_device = Kokkos::HIP; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) using default_device = Kokkos::Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 314439b6c0..f3eb0dd8ac 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -117,8 +117,7 @@ struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct FactorizeModeAndAlgo - : FactorizeModeAndAlgoDeviceImpl {}; +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #endif template @@ -156,8 +155,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif template diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 3f15ca0b2d..67a141578e 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -127,7 +127,7 @@ struct InverseDiagonalsModeAndAlgo #if defined(KOKKOS_ENABLE_HIP) template <> -struct 
InverseDiagonalsModeAndAlgo +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif @@ -166,8 +166,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 499a701c13..7bc25a5704 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -210,7 +210,7 @@ int main(int argc, char** argv) { } if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.n, params.repeat); + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 89680d20f9..54ae35ac7a 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -207,7 +207,7 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.repeat); + run(params.m, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 564db4af2e..5dfecd9015 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -211,11 +211,9 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) if (params.layoutLeft) - run(params.m, params.n, - params.repeat); + run(params.m, params.n, params.repeat); else - run(params.m, params.n, - params.repeat); + run(params.m, params.n, 
params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 962328eb95..8f25026ba9 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -188,7 +188,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index 32d91e6b33..d617ffcdf3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 57f241d7b1..134611739a 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -632,8 +632,8 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { KokkosKernels::Experiment::run_multi_mem_experiment< - size_type, idx, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + size_type, idx, Kokkos::HIP, Kokkos::HIPSpace, Kokkos::HIPSpace>( + params); } #endif diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index f05040c083..e4331dd542 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ 
-708,8 +708,7 @@ int main(int argc, char* argv[]) { if (params.use_hip) { if (!use_multi_mem) { KokkosKernels::Experiment::experiment_driver< - kk_size_type, kk_lno_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace>(params); + kk_size_type, kk_lno_t, Kokkos::HIP, Kokkos::HIPSpace>(params); } } #endif diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index a97cbb4d81..8f7d6a1983 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -316,7 +316,7 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { - run_mis2(params); + run_mis2(params); run = true; } #endif diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 475bfe5f85..9825f7c90d 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -366,8 +366,7 @@ int main(int argc, char **argv) { if (cmdline[CMD_USE_CUDA]) run_pcg(cmdline, mtx_file); #endif #if defined(KOKKOS_ENABLE_HIP) - if (cmdline[CMD_USE_HIP]) - run_pcg(cmdline, mtx_file); + if (cmdline[CMD_USE_HIP]) run_pcg(cmdline, mtx_file); #endif } Kokkos::finalize(); diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index c3df57c65f..060a9d66c7 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -205,8 +205,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif if (vector_length < 1) { @@ -594,8 +593,7 @@ static void spmv_beta_transpose(const execution_space& exec, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif while ((vector_length * 2 * 3 <= NNZPerRow) && 
(vector_length < max_vector_length)) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b14c9be072..ee7e83b554 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2734,8 +2734,8 @@ struct ReturnRangePolicyType { #endif #ifdef KOKKOS_ENABLE_HIP template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; static inline PolicyType get_policy(int nt, int ts) { return PolicyType(nt, ts); diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index be3ac80343..7070172a1f 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -63,7 +63,7 @@ inline int RowsPerThread(const int /*NNZPerRow*/) { #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline int RowsPerThread(const int /*NNZPerRow*/) { +inline int RowsPerThread(const int /*NNZPerRow*/) { return 1; } #endif diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 1106d300c8..a95c828c96 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -661,7 +661,7 @@ class SPGEMMHandle { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { this->algorithm_type = SPGEMM_KK; #ifdef VERBOSE std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 0658adbccf..bd038813d1 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -203,7 +203,7 @@ void spmv(const ExecutionSpace& space, #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE if (std::is_same::value) { + Kokkos::HIPSpace>::value) { useFallback = useFallback || (mode[0] != NoTranspose[0]); } #endif diff --git 
a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 1df4a7e5c9..01a0ce1373 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -183,25 +183,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ - template <> \ - struct spmv_tpl_spec_avail< \ - Kokkos::HIP, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - const rocsparse_int>, \ - Kokkos::View< \ - const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const rocsparse_int>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(double, Kokkos::LayoutLeft) diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 6d619d1378..c9e02698c5 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -32,6 +32,6 @@ class hip : public ::testing::Test { }; #define TestCategory hip -#define TestDevice Kokkos::Experimental::HIP +#define TestDevice Kokkos::HIP #endif // TEST_HIP_HPP From d215d0211b491fa707dae9f5307956776aa74641 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 27 Oct 2023 12:12:52 -0600 Subject: [PATCH 226/231] Merge pull request #2021 from cwpearson/fix/issue-2010 hide native merge-path SpMV behind "native-merge" (cherry picked from commit 89df0f92e3049d1b1d570f6ea1c1da1c9dec4568) --- sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- 
sparse/unit_test/Test_Sparse_spmv.hpp | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 060a9d66c7..4f90002a61 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -31,7 +31,7 @@ namespace KokkosSparse { namespace Impl { -constexpr const char* KOKKOSSPARSE_ALG_MERGE = "merge"; +constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; // This TransposeFunctor is functional, but not necessarily performant. template ::spmv( exec, mode, alpha, A, x, beta, y); } else { @@ -640,7 +640,7 @@ static void spmv_beta(const execution_space& exec, false>(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 8fdb56b5f4..990fcc1a30 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -549,6 +549,12 @@ void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, test_spmv( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "native-merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } } template Date: Thu, 26 Oct 2023 11:07:18 -0600 Subject: [PATCH 227/231] Merge pull request #2020 from ndellingwood/export-all_libs-add-aliases KokkosKernelsConfig.cmake: add all_libs target and necessary aliases (cherry picked from commit 2c70f24216f9890abb2b23e76c84c935fa634bc9) --- cmake/KokkosKernelsConfig.cmake.in | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index fbceffe76c..9b649d26c6 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,3 +11,13 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) INCLUDE("${KokkosKernels_CMAKE_DIR}/KokkosKernelsTargets.cmake") +IF(NOT TARGET KokkosKernels::all_libs) + # CMake Error at /lib/cmake/Kokkos/KokkosConfigCommon.cmake:10 (ADD_LIBRARY): + # ADD_LIBRARY cannot create ALIAS target "Kokkos::all_libs" because target + # "KokkosKernels::kokkoskernels" is imported but not globally visible. + IF(CMAKE_VERSION VERSION_LESS "3.18") + SET_TARGET_PROPERTIES(Kokkos::kokkoskernels PROPERTIES IMPORTED_GLOBAL ON) + ENDIF() + ADD_LIBRARY(KokkosKernels::all_libs ALIAS Kokkos::kokkoskernels) + ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS Kokkos::kokkoskernels) +ENDIF() From e295b17cc6fc7d4216984bbb84ba932ac19241f5 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 26 Oct 2023 08:54:02 -0600 Subject: [PATCH 228/231] Merge pull request #1985 from eeprude/lapackDir Creation of the 'lapack' subdirectory, parallel to 'blas' (cherry picked from commit e7b6c1230c09368b6ceaf01fdb212fcdbdb94518) --- CMakeLists.txt | 10 +- blas/CMakeLists.txt | 14 - blas/src/KokkosBlas_gesv.hpp | 101 +------ blas/src/KokkosBlas_trtri.hpp | 74 +---- blas/tpls/KokkosBlas_Host_tpl.cpp | 97 ------- blas/tpls/KokkosBlas_Host_tpl.hpp | 6 - blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 107 ------- blas/unit_test/Test_Blas.hpp | 3 - cmake/KokkosKernels_config.h.in | 2 + cmake/kokkoskernels_components.cmake | 10 + lapack/CMakeLists.txt | 67 +++++ .../KokkosLapack_gesv_eti_spec_inst.cpp.in | 6 +- .../KokkosLapack_trtri_eti_spec_inst.cpp.in | 6 +- .../KokkosLapack_gesv_eti_spec_avail.hpp.in | 8 +- .../KokkosLapack_trtri_eti_spec_avail.hpp.in | 12 +- .../impl/KokkosLapack_gesv_impl.hpp | 14 +- .../impl/KokkosLapack_gesv_spec.hpp | 106 +++---- .../impl/KokkosLapack_trtri_impl.hpp | 12 +- 
.../impl/KokkosLapack_trtri_spec.hpp | 56 ++-- lapack/src/KokkosLapack_gesv.hpp | 151 ++++++++++ lapack/src/KokkosLapack_trtri.hpp | 119 ++++++++ lapack/tpls/KokkosLapack_Cuda_tpl.cpp | 18 ++ lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 64 ++++ lapack/tpls/KokkosLapack_Host_tpl.cpp | 152 ++++++++++ lapack/tpls/KokkosLapack_Host_tpl.hpp | 44 +++ .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 64 ++-- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 214 +++++++------- .../KokkosLapack_trtri_tpl_spec_avail.hpp | 133 +++++++++ .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 274 ++++++++++-------- lapack/unit_test/CMakeLists.txt | 94 ++++++ lapack/unit_test/Test_Lapack.hpp | 22 ++ .../unit_test/Test_Lapack_gesv.hpp | 55 ++-- .../unit_test/Test_Lapack_trtri.hpp | 26 +- .../unit_test/backends/Test_Cuda_Lapack.cpp | 22 ++ lapack/unit_test/backends/Test_HIP_Lapack.cpp | 22 ++ .../unit_test/backends/Test_OpenMP_Lapack.cpp | 22 ++ .../unit_test/backends/Test_Serial_Lapack.cpp | 22 ++ .../backends/Test_Threads_Lapack.cpp | 22 ++ .../blas/blas3/KokkosBlas_trtri_perf_test.hpp | 8 +- sparse/src/KokkosSparse_sptrsv_supernode.hpp | 6 +- 40 files changed, 1451 insertions(+), 814 deletions(-) delete mode 100644 blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp create mode 100644 lapack/CMakeLists.txt rename blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in => lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in (88%) rename blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in => lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in (88%) rename blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in => lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in (80%) rename blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in => 
lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in (73%) rename blas/impl/KokkosBlas_gesv_impl.hpp => lapack/impl/KokkosLapack_gesv_impl.hpp (73%) rename blas/impl/KokkosBlas_gesv_spec.hpp => lapack/impl/KokkosLapack_gesv_spec.hpp (74%) rename blas/impl/KokkosBlas_trtri_impl.hpp => lapack/impl/KokkosLapack_trtri_impl.hpp (91%) rename blas/impl/KokkosBlas_trtri_spec.hpp => lapack/impl/KokkosLapack_trtri_spec.hpp (77%) create mode 100644 lapack/src/KokkosLapack_gesv.hpp create mode 100644 lapack/src/KokkosLapack_trtri.hpp create mode 100644 lapack/tpls/KokkosLapack_Cuda_tpl.cpp create mode 100644 lapack/tpls/KokkosLapack_Cuda_tpl.hpp create mode 100644 lapack/tpls/KokkosLapack_Host_tpl.cpp create mode 100644 lapack/tpls/KokkosLapack_Host_tpl.hpp rename blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp => lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp (60%) rename blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp => lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp (87%) create mode 100644 lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp rename blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp => lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp (50%) create mode 100644 lapack/unit_test/CMakeLists.txt create mode 100644 lapack/unit_test/Test_Lapack.hpp rename blas/unit_test/Test_Blas_gesv.hpp => lapack/unit_test/Test_Lapack_gesv.hpp (89%) rename blas/unit_test/Test_Blas_trtri.hpp => lapack/unit_test/Test_Lapack_trtri.hpp (94%) create mode 100644 lapack/unit_test/backends/Test_Cuda_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_HIP_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_OpenMP_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_Serial_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_Threads_Lapack.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 79aaa7d380..8e990cece5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ IF (KokkosKernels_INSTALL_TESTING) 
KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/dense/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) @@ -192,7 +193,7 @@ ELSE() "ALL" STRING "A list of components to enable in testing and building" - VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + VALID_ENTRIES BATCHED BLAS LAPACK GRAPH SPARSE ALL ) # ================================================================== @@ -243,6 +244,7 @@ ELSE() MESSAGE(" COMMON: ON") MESSAGE(" BATCHED: ${KokkosKernels_ENABLE_COMPONENT_BATCHED}") MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") + MESSAGE(" LAPACK: ${KokkosKernels_ENABLE_COMPONENT_LAPACK}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") @@ -287,6 +289,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) INCLUDE(blas/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + INCLUDE(lapack/CMakeLists.txt) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) INCLUDE(graph/CMakeLists.txt) ENDIF() @@ -405,6 +410,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) ENDIF() diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index d6ce98dae9..869b152e7b 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -101,13 +101,6 @@ KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv - COMPONENTS blas - HEADER_LIST ETI_HEADERS 
- SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES -) - KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby COMPONENTS blas HEADER_LIST ETI_HEADERS @@ -324,10 +317,3 @@ KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) - -KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri - COMPONENTS blas - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES -) diff --git a/blas/src/KokkosBlas_gesv.hpp b/blas/src/KokkosBlas_gesv.hpp index 89b9d36c96..1326c6fb8e 100644 --- a/blas/src/KokkosBlas_gesv.hpp +++ b/blas/src/KokkosBlas_gesv.hpp @@ -25,10 +25,7 @@ #ifndef KOKKOSBLAS_GESV_HPP_ #define KOKKOSBLAS_GESV_HPP_ -#include - -#include "KokkosBlas_gesv_spec.hpp" -#include "KokkosKernels_Error.hpp" +#include "KokkosLapack_gesv.hpp" namespace KokkosBlas { @@ -49,100 +46,8 @@ namespace KokkosBlas { /// its data pointer is NULL, pivoting is not used. /// template -void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosBlas::gesv only supports for MAGMA TPL and BLAS TPL. 
- // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views BLAS TPL should be enabled to call the BLAS interface - // for host views - - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: B must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: IPIV must be a Kokkos::View."); - static_assert(static_cast(AMatrix::rank) == 2, - "KokkosBlas::gesv: A must have rank 2."); - static_assert( - static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, - "KokkosBlas::gesv: B must have either rank 1 or rank 2."); - static_assert(static_cast(IPIVV::rank) == 1, - "KokkosBlas::gesv: IPIV must have rank 1."); - - int64_t IPIV0 = IPIV.extent(0); - int64_t A0 = A.extent(0); - int64_t A1 = A.extent(1); - int64_t B0 = B.extent(0); - - // Check validity of pivot argument - bool valid_pivot = - (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); - if (!(valid_pivot)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "Valid options include zero-extent 1-D view (no pivoting), or 1-D " - "View with size of " - << A0 << " (partial pivoting)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Check for no pivoting case. Only MAGMA supports no pivoting interface -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL - if ((!std::is_same::value) && - (IPIV0 == 0) && (IPIV.data() == nullptr)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "BLAS TPL does not support no pivoting."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } -#endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL - if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". 
" - << "BLAS TPL does not support no pivoting."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } -#endif -#endif - - // Check compatibility of dimensions at run time. - if ((A0 < A1) || (A0 != B0)) { - std::ostringstream os; - os << "KokkosBlas::gesv: Dimensions of A, and B do not match: " - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - typedef Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits > - AMatrix_Internal; - typedef Kokkos::View > - BXMV_Internal; - typedef Kokkos::View< - typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, - typename IPIVV::device_type, Kokkos::MemoryTraits > - IPIVV_Internal; - AMatrix_Internal A_i = A; - // BXMV_Internal B_i = B; - IPIVV_Internal IPIV_i = IPIV; - - if (BXMV::rank == 1) { - auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); - } else { // BXMV::rank == 2 - auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); - } +[[deprecated]] void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + KokkosLapack::gesv(A, B, IPIV); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index b1a34f0483..d9771e3a16 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -18,12 +18,7 @@ /// \file KokkosBlas_trtri.hpp -#include "KokkosKernels_Macros.hpp" -#include "KokkosBlas_trtri_spec.hpp" -#include "KokkosKernels_helpers.hpp" -#include -#include -#include "KokkosKernels_Error.hpp" +#include "KokkosLapack_trtri.hpp" namespace KokkosBlas { @@ -48,70 +43,9 @@ namespace KokkosBlas { // and the inversion could not be completed. 
// source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template -int trtri(const char uplo[], const char diag[], const AViewType& A) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - - // Check validity of indicator argument - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); - - if (!valid_uplo) { - std::ostringstream os; - os << "KokkosBlas::trtri: uplo = '" << uplo[0] << "'. " - << "Valid values include 'U' or 'u' (A is upper triangular), " - "'L' or 'l' (A is lower triangular)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - if (!valid_diag) { - std::ostringstream os; - os << "KokkosBlas::trtri: diag = '" << diag[0] << "'. " - << "Valid values include 'U' or 'u' (the diagonal of A is assumed to be " - "unit), " - "'N' or 'n' (the diagonal of A is assumed to be non-unit)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - int64_t A_m = A.extent(0); - int64_t A_n = A.extent(1); - - // Return if degenerated matrices are provided - if (A_m == 0 || A_n == 0) - return 0; // This is success as the inverse of a matrix with no elements is - // itself. 
- - // Ensure that the dimensions of A match and that we can legally perform A*B - // or B*A - if (A_m != A_n) { - std::ostringstream os; - os << "KokkosBlas::trtri: Dimensions of A do not match," - << " A: " << A.extent(0) << " x " << A.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; - - // This is the return value type and should always reside on host - using RViewInternalType = - Kokkos::View >; - - int result; - RViewInternalType R = RViewInternalType(&result); - - KokkosBlas::Impl::TRTRI::trtri(R, uplo, - diag, A); - - return result; +[[deprecated]] int trtri(const char uplo[], const char diag[], + const AViewType& A) { + return KokkosLapack::trtri(uplo, diag, A); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index b85f6109e8..6b158f4d19 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -382,39 +382,6 @@ void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const std::complex*, const std::complex*, int*, /* */ std::complex*, int*); - -/// -/// Gesv -/// - -void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, - int*); -void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, - int*, int*); -void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, - std::complex*, int*, int*); -void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, - int*, std::complex*, int*, int*); - -/// -/// Trtri -/// -/* - HostBlas::trtri(const char uplo, const char diag, - int n, const float *a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, - &diag, &n, - a, &lda, &info); -*/ -void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, - const float*, int*, int*); -void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, - const double*, int*, int*); 
-void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); -void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); } void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, @@ -524,16 +491,6 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CTRSM F77_BLAS_MANGLE(ctrsm, CTRSM) #define F77_FUNC_ZTRSM F77_BLAS_MANGLE(ztrsm, ZTRSM) -#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) -#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) -#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) -#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) - -#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) -#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) -#define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) -#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) - namespace KokkosBlas { namespace Impl { @@ -647,18 +604,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { - F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// double @@ -771,18 +716,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { - F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { - int info = 0; - F77_FUNC_DTRTRI(&uplo, &diag, &n, a, 
&lda, &info); - return info; -} /// /// std::complex @@ -944,21 +877,6 @@ void HostBlas >::trsm(const char side, const char uplo, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// std::complex @@ -1118,21 +1036,6 @@ void HostBlas >::trsm( (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 6f6c34dc25..06a5620155 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -106,12 +106,6 @@ struct HostBlas { const char diag, int m, int n, const T alpha, const T *a, int lda, /* */ T *b, int ldb); - - static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, - int info); - - static int trtri(const char uplo, const char diag, int n, const T *a, - int lda); }; } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp deleted file mode 100644 index de9fc08c99..0000000000 --- a/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ /dev/null @@ -1,107 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ - -namespace KokkosBlas { -namespace Impl { - -// Specialization struct which defines whether a specialization exists -template -struct trtri_tpl_spec_avail { - enum : bool { value = false }; -}; - -// Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ - template \ - struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ - }; - -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, MEMSPACE) \ - KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) -#else -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, MEMSPACE) -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) \ - KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) -#else -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA - -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) 
-KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) - -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) - -} // namespace 
Impl -} // namespace KokkosBlas - -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index 1f4f130e8b..a29c5ffd72 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -16,9 +16,6 @@ #ifndef TEST_BLAS_HPP #define TEST_BLAS_HPP -#include "Test_Blas_gesv.hpp" -#include "Test_Blas_trtri.hpp" - // Blas 1 #include "Test_Blas1_abs.hpp" #include "Test_Blas1_asum.hpp" diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b8b66fffbb..7a61771231 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -109,6 +109,8 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS +/* LAPACK */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL /* CUSPARSE */ diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 1feb5bb8b8..16a784bd1f 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -29,6 +29,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the blas component. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + "ENABLE_COMPONENT_LAPACK" + OFF + BOOL + "Whether to build the lapack component. Default: OFF" +) + # SPARSE depends on everything else at the moment. 
KOKKOSKERNELS_ADD_OPTION( "ENABLE_COMPONENT_SPARSE" @@ -67,6 +74,7 @@ ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() @@ -74,6 +82,7 @@ ENDIF() IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_ODE ON CACHE BOOL "" FORCE) @@ -85,6 +94,7 @@ ENDIF() # but marking it as advanced should hide it from GUIs IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_LAPACK AND KokkosKernels_ENABLE_COMPONENT_GRAPH AND KokkosKernels_ENABLE_COMPONENT_SPARSE AND KokkosKernels_ENABLE_COMPONENT_ODE) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt new file mode 100644 index 0000000000..8ab784a325 --- /dev/null +++ b/lapack/CMakeLists.txt @@ -0,0 +1,67 @@ +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/impl) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/tpls) + +# Adding unit-tests +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/lapack) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/lapack) + +######################### +# # +# Logic for LAPACK TPLs # +# # +######################### + 
+#Include LAPACK, Lapack host wrapper +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + #Do NOT add this to include path + APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/tpls/KokkosLapack_Host_tpl.cpp) +ENDIF() + +# Include host lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Host_tpl.cpp + ) +ENDIF() + +# Include cuda lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Cuda_tpl.cpp + ) +ENDIF() + +# Include rocm lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Rocm_tpl.cpp + ) +ENDIF() + +################## +# # +# ETI generation # +# # +################## + +#Build up a list of DECL, AVAIL, and INST macros +#that should be instantiated based on input options +#Generate @X@ variables in the template X.hpp.in and X.cpp.in +#files containing the list of all needed macros + +KOKKOSKERNELS_GENERATE_ETI(Lapack_gesv gesv + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Lapack_trtri trtri + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) diff --git a/blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in similarity index 88% rename from blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in index 32473be3ad..da521984a4 100644 --- a/blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in +++ 
b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in @@ -17,10 +17,10 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_gesv_spec.hpp" +#include "KokkosLapack_gesv_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -@BLAS_GESV_ETI_INST_BLOCK@ +@LAPACK_GESV_ETI_INST_BLOCK@ } //IMPL } //Kokkos diff --git a/blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in similarity index 88% rename from blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in index 64755f7a54..c4ab12f5a4 100644 --- a/blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in +++ b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in @@ -17,10 +17,10 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_trtri_spec.hpp" +#include "KokkosLapack_trtri_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_INST_BLOCK@ +@LAPACK_TRTRI_ETI_INST_BLOCK@ } //IMPL } //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in similarity index 80% rename from blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in index ae262c912e..d1f36e3069 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef 
KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { namespace Impl { -@BLAS_GESV_ETI_AVAIL_BLOCK@ +@LAPACK_GESV_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in similarity index 73% rename from blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in index 3f669efa06..89443c2c9b 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in @@ -14,13 +14,13 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_AVAIL_BLOCK@ +@LAPACK_TRTRI_ETI_AVAIL_BLOCK@ } // Impl -} // KokkosBlas -#endif // KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ +} // KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ diff --git a/blas/impl/KokkosBlas_gesv_impl.hpp b/lapack/impl/KokkosLapack_gesv_impl.hpp similarity index 73% rename from blas/impl/KokkosBlas_gesv_impl.hpp rename to lapack/impl/KokkosLapack_gesv_impl.hpp index e51e48309f..3a60f42171 100644 --- a/blas/impl/KokkosBlas_gesv_impl.hpp +++ b/lapack/impl/KokkosLapack_gesv_impl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_HPP_ -#define KOKKOSBLAS_IMPL_GESV_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_HPP_ -/// \file KokkosBlas_gesv_impl.hpp +/// \file KokkosLapack_gesv_impl.hpp /// \brief 
Implementation(s) of dense linear solve. #include #include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -// NOTE: Might add the implementation of KokkosBlas::gesv later +// NOTE: Might add the implementation of KokkosLapack::gesv later } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOSBLAS_IMPL_GESV_HPP +#endif // KOKKOSLAPACK_IMPL_GESV_HPP diff --git a/blas/impl/KokkosBlas_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp similarity index 74% rename from blas/impl/KokkosBlas_gesv_spec.hpp rename to lapack/impl/KokkosLapack_gesv_spec.hpp index f1dff467c8..b9f8549311 100644 --- a/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ -#define KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ #include #include @@ -22,10 +22,10 @@ // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -33,37 +33,37 @@ struct gesv_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization availability -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct gesv_eti_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct gesv_eti_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations -#include -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Unification layer -/// \brief Implementation of KokkosBlas::gesv. +/// \brief Implementation of KokkosLapack::gesv. template ::value, @@ -79,54 +79,54 @@ template struct GESV { static void gesv(const AMatrix & /* A */, const BXMV & /* B */, const IPIVV & /* IPIV */) { - // NOTE: Might add the implementation of KokkosBlas::gesv later + // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " - "exists. Enable BLAS and/or MAGMA TPL."); + "exists. Enable LAPACK and/or MAGMA TPL."); } }; #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization of -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - extern template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#endif // KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_trtri_impl.hpp b/lapack/impl/KokkosLapack_trtri_impl.hpp similarity index 91% rename from blas/impl/KokkosBlas_trtri_impl.hpp rename to lapack/impl/KokkosLapack_trtri_impl.hpp index 4501763ea8..9f52c2d412 100644 --- a/blas/impl/KokkosBlas_trtri_impl.hpp +++ b/lapack/impl/KokkosLapack_trtri_impl.hpp @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_IMPL_HPP_ -#define KOKKOSBLAS_TRTRI_IMPL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_IMPL_HPP_ +#define KOKKOSLAPACK_TRTRI_IMPL_HPP_ /** - * \file KokkosBlas_trtri_impl.hpp + * \file KokkosLapack_trtri_impl.hpp * \brief Implementation of triangular matrix inverse */ @@ -27,7 +27,7 @@ #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" -namespace 
KokkosBlas { +namespace KokkosLapack { namespace Impl { template @@ -65,5 +65,5 @@ void SerialTrtri_Invoke(const RViewType &R, const char uplo[], } } } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSBLAS_TRTRI_IMPL_HPP_ +} // namespace KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp similarity index 77% rename from blas/impl/KokkosBlas_trtri_spec.hpp rename to lapack/impl/KokkosLapack_trtri_spec.hpp index 2a4d2db576..a17184dc41 100644 --- a/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -13,17 +13,17 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_SPEC_HPP_ -#define KOKKOSBLAS_TRTRI_SPEC_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_SPEC_HPP_ +#define KOKKOSLAPACK_TRTRI_SPEC_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -31,14 +31,14 @@ struct trtri_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ Kokkos::View -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // @@ -77,8 +77,8 @@ struct TRTRI { static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::trtri[ETI]" - : "KokkosBlas::trtri[noETI]"); + ? "KokkosLapack::trtri[ETI]" + : "KokkosLapack::trtri[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename RVIT::HostMirror host_R = Kokkos::create_mirror_view(R); @@ -97,7 +97,7 @@ struct TRTRI { //! KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // These Macros are only included when we are not compiling libkokkoskernels but @@ -106,22 +106,24 @@ struct TRTRI { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - extern template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + extern template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_TRTRI_SPEC_HPP_ +#endif // KOKKOSLAPACK_TRTRI_SPEC_HPP_ diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp new file mode 100644 index 0000000000..4c9058f8ab --- /dev/null +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -0,0 +1,151 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosLapack_gesv.hpp +/// \brief Local dense linear solve +/// +/// This file provides KokkosLapack::gesv. This function performs a +/// local (no MPI) dense linear solve on a system of linear equations +/// A * X = B where A is a general N-by-N matrix and X and B are N-by-NRHS +/// matrices. + +#ifndef KOKKOSLAPACK_GESV_HPP_ +#define KOKKOSLAPACK_GESV_HPP_ + +#include + +#include "KokkosLapack_gesv_spec.hpp" +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +/// \brief Solve the dense linear equation system A*X = B. +/// +/// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. +/// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a +/// 1-D or 2-D Kokkos::View. \tparam IPIVV Output pivot indices, as a 1-D +/// Kokkos::View +/// +/// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the +/// factors L and U from +/// the factorization A = P*L*U; the unit diagonal elements of L are not +/// stored. +/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, +/// the solution (multi)vector X. \param IPIV [out] On exit, the pivot indices +/// (for partial pivoting). If the View extents are zero and +/// its data pointer is NULL, pivoting is not used. +/// +template +void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK + // TPL. 
+ // MAGMA TPL should be enabled to call the MAGMA GPU interface for + // device views LAPACK TPL should be enabled to call the LAPACK + // interface for host views + + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: B must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: IPIV must be a Kokkos::View."); + static_assert(static_cast(AMatrix::rank) == 2, + "KokkosLapack::gesv: A must have rank 2."); + static_assert( + static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, + "KokkosLapack::gesv: B must have either rank 1 or rank 2."); + static_assert(static_cast(IPIVV::rank) == 1, + "KokkosLapack::gesv: IPIV must have rank 1."); + + int64_t IPIV0 = IPIV.extent(0); + int64_t A0 = A.extent(0); + int64_t A1 = A.extent(1); + int64_t B0 = B.extent(0); + + // Check validity of pivot argument + bool valid_pivot = + (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); + if (!(valid_pivot)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "Valid options include zero-extent 1-D view (no pivoting), or 1-D " + "View with size of " + << A0 << " (partial pivoting)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check for no pivoting case. Only MAGMA supports no pivoting interface +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL + if ((!std::is_same::value) && + (IPIV0 == 0) && (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". 
" + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL + if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif +#endif + + // Check compatibility of dimensions at run time. + if ((A0 < A1) || (A0 != B0)) { + std::ostringstream os; + os << "KokkosLapack::gesv: Dimensions of A, and B do not match: " + << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) + << " x " << B.extent(1); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + typedef Kokkos::View< + typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, + typename AMatrix::device_type, Kokkos::MemoryTraits > + AMatrix_Internal; + typedef Kokkos::View > + BXMV_Internal; + typedef Kokkos::View< + typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, + typename IPIVV::device_type, Kokkos::MemoryTraits > + IPIVV_Internal; + AMatrix_Internal A_i = A; + // BXMV_Internal B_i = B; + IPIVV_Internal IPIV_i = IPIV; + + if (BXMV::rank == 1) { + auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); + KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + } else { // BXMV::rank == 2 + auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); + KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + } +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_GESV_HPP_ diff --git a/lapack/src/KokkosLapack_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp new file mode 100644 index 0000000000..9a884f2303 --- /dev/null +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -0,0 +1,119 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_TRTRI_HPP_ +#define KOKKOSLAPACK_TRTRI_HPP_ + +/// \file KokkosLapack_trtri.hpp + +#include "KokkosKernels_Macros.hpp" +#include "KokkosLapack_trtri_spec.hpp" +#include "KokkosKernels_helpers.hpp" +#include +#include +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +/// \brief Find the inverse of the triangular matrix, A +/// +/// A = inv(A) +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// +/// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular +/// matrix +/// "L" or "l" indicates matrix A is a lower triangular matrix +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +// "N" or "n" indicates the diagonal of A is assumed to be +// non-unit +/// \param A [in,out] Input matrix, as a 2-D Kokkos::View +/// On entry, A +/// On successful exit, inv(A) +/// \return 0 upon success, +// i if the i-th diagonal element of A is zero, A is singular, +// and the inversion could not be completed.
+// source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri +template +int trtri(const char uplo[], const char diag[], const AViewType& A) { + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + + // Check validity of indicator argument + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || + (diag[0] == 'n'); + + if (!valid_uplo) { + std::ostringstream os; + os << "KokkosLapack::trtri: uplo = '" << uplo[0] << "'. " + << "Valid values include 'U' or 'u' (A is upper triangular), " + "'L' or 'l' (A is lower triangular)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (!valid_diag) { + std::ostringstream os; + os << "KokkosLapack::trtri: diag = '" << diag[0] << "'. " + << "Valid values include 'U' or 'u' (the diagonal of A is assumed to be " + "unit), " + "'N' or 'n' (the diagonal of A is assumed to be non-unit)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + int64_t A_m = A.extent(0); + int64_t A_n = A.extent(1); + + // Return if degenerated matrices are provided + if (A_m == 0 || A_n == 0) + return 0; // This is success as the inverse of a matrix with no elements is + // itself. 
+ + // Ensure that A is square; the inverse of a triangular matrix is only + // defined for square matrices + if (A_m != A_n) { + std::ostringstream os; + os << "KokkosLapack::trtri: Dimensions of A do not match," + << " A: " << A.extent(0) << " x " << A.extent(1); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Create A matrix view type alias + using AViewInternalType = + Kokkos::View >; + + // This is the return value type and should always reside on host + using RViewInternalType = + Kokkos::View >; + + int result; + RViewInternalType R = RViewInternalType(&result); + + KokkosLapack::Impl::TRTRI::trtri( + R, uplo, diag, A); + + return result; +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_TRTRI_HPP_ diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp new file mode 100644 index 0000000000..2ac28871a4 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include +#include +#include diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp new file mode 100644 index 0000000000..b59d6d99c8 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS).
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_CUDA_TPL_HPP_ +#define KOKKOSLAPACK_CUDA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include + +namespace KokkosLapack { +namespace Impl { + +CudaLapackSingleton::CudaLapackSingleton() { + cusolverStatus_t stat = cusolverDnCreate(&handle); + if (stat != CUSOLVER_STATUS_SUCCESS) + Kokkos::abort("CUSOLVER initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); +} + +CudaLapackSingleton& CudaLapackSingleton::singleton() { + static CudaLapackSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosLapack { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSLAPACK_CUDA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp new file mode 100644 index 0000000000..d629a17f1d --- /dev/null +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -0,0 +1,152 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \file KokkosLapack_Host_tpl.cpp +/// \brief LAPACK wrapper for host tpls +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosKernels_config.h" +#include "KokkosLapack_Host_tpl.hpp" + +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) + +/// Fortran headers +extern "C" { + +/// +/// Gesv +/// + +void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, + int*); +void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, + int*, int*); +void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, + std::complex*, int*, int*); +void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, + int*, std::complex*, int*, int*); + +/// +/// Trtri +/// +/* + HostLapack::trtri(const char uplo, const char diag, + int n, const float *a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, + &diag, &n, + a, &lda, &info); +*/ +void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, + const float*, int*, int*); +void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, + const double*, int*, int*); +void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +} + +#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) +#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) +#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) +#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) + +#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) +#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) +#define 
F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) +#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) + +namespace KokkosLapack { +namespace Impl { + +/// +/// float +/// + +template <> +void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, + float* b, int ldb, int info) { + F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const float* a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// double +/// + +template <> +void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, + double* b, int ldb, int info) { + F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const double* a, int lda) { + int info = 0; + F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, + const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK diff --git a/lapack/tpls/KokkosLapack_Host_tpl.hpp b/lapack/tpls/KokkosLapack_Host_tpl.hpp 
new file mode 100644 index 0000000000..d74099aaec --- /dev/null +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_HOST_TPL_HPP_ +#define KOKKOSLAPACK_HOST_TPL_HPP_ + +/// \file KokkosLapack_Host_tpl.hpp +/// \brief LAPACK wrapper + +#include "KokkosKernels_config.h" +#include "Kokkos_ArithTraits.hpp" + +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) + +namespace KokkosLapack { +namespace Impl { + +template +struct HostLapack { + static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, + int info); + + static int trtri(const char uplo, const char diag, int n, const T *a, + int lda); +}; +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#endif // KOKKOSLAPACK_HOST_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp similarity index 60% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index f909b4a295..a3d8bb6ee9 100644 --- a/blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -14,10 +14,10 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // 
Specialization struct which defines whether a specialization exists template @@ -25,10 +25,10 @@ struct gesv_tpl_spec_avail { enum : bool { value = false }; }; -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -38,30 +38,30 @@ struct gesv_tpl_spec_avail { enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( double, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( float, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - 
KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif */ #endif @@ -69,7 +69,7 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -79,36 +79,36 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, 
Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex,Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif */ #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp similarity index 87% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 7d8f0a8a2b..2baa76a132 100644 --- a/blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { template inline void gesv_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - printf("KokkosBlas::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", + printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #else -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - printf("KokkosBlas::gesv<> TPL Blas specialization for < %s , %s, %s >\n", +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK + printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", 
typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #endif @@ -36,16 +36,16 @@ inline void gesv_print_specialization() { #endif } } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#include +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -74,7 +74,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -89,65 +89,65 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), \ + B.data(), LDB, info); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS_SGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View< \ + int*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + PViewType; \ + \ + static void gesv(const AViewType& A, const BViewType& B, \ + const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ + gesv_print_specialization(); \ + const bool with_pivot = \ + !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ + \ + const int N = static_cast(A.extent(1)); \ + const int AST = static_cast(A.stride(1)); \ + const int LDA = (AST == 0) ? 1 : AST; \ + const int BST = static_cast(B.stride(1)); \ + const int LDB = (BST == 0) ? 
1 : BST; \ + const int NRHS = static_cast(B.extent(1)); \ + \ + int info = 0; \ + \ + if (with_pivot) { \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ + LDB, info); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS_ZGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -178,7 +178,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -193,7 +193,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -202,7 +202,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -233,7 +233,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -248,7 +248,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -257,30 +257,30 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, 
Kokkos::HostSpace, false) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -309,7 +309,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -321,8 +321,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 
1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -339,7 +339,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -368,7 +368,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,float]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -380,8 +380,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -398,7 +398,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -429,7 +429,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -441,8 +441,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 
1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -459,7 +459,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -490,7 +490,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -502,8 +502,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -520,20 +520,20 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, 
false) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA #endif diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp new file mode 100644 index 0000000000..7251d97086 --- /dev/null +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosLapack { +namespace Impl { + +// Specialization struct which defines whether a specialization exists +template +struct trtri_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side LAPACK (could be MKL or whatever) +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ + template \ + struct trtri_tpl_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUTA, MEMSPACE) \ + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) +#else +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUTA, MEMSPACE) +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) \ + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) +#else +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) 
+#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACKy_TRTRI_TPL_SPEC_AVAIL_HPP_ diff 
--git a/blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp similarity index 50% rename from blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 46ec894547..3ed0623018 100644 --- a/blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -14,18 +14,18 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ -#include "KokkosBlas_Host_tpl.hpp" // trtri prototype -#include "KokkosBlas_tpl_spec.hpp" +#include "KokkosLapack_Host_tpl.hpp" // trtri prototype +//#include "KokkosLapack_tpl_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ + MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRTRI >, \ @@ -44,8 +44,8 @@ namespace Impl { \ static void trtri(const RViewType& R, const char uplo[], \ const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trtri[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ const int M = static_cast(A.extent(0)); \ \ bool A_is_layout_left = \ @@ -61,136 +61,164 @@ namespace Impl { else \ uplo_ = A_is_layout_left ? 
'U' : 'L'; \ \ - R() = HostBlas::trtri( \ + R() = HostLapack::trtri( \ uplo_, diag[0], M, \ reinterpret_cast(A.data()), LDA); \ Kokkos::Profiling::popRegion(); \ } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ + MEM_SPACE, ETI_SPEC_AVAIL) +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trtri[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - magma_int_t M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - magma_int_t info = 0; \ - magma_uplo_t uplo_; \ - magma_diag_t diag_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ - else \ - uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ - \ - if (diag[0] == 'U' || diag[0] == 'u') \ - diag_ = MagmaUnit; \ - else \ - diag_ = MagmaNonUnit; \ - \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ - R() = MAGMA_FN(uplo_, diag_, M, \ - reinterpret_cast( \ - const_cast(A.data())), \ - LDA, &info); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], \ + const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + magma_int_t M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = \ + std::is_same::value; \ + \ + magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ + LDA = (AST == 0) ? 1 : AST; \ + magma_int_t info = 0; \ + magma_uplo_t uplo_; \ + magma_diag_t diag_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ + else \ + uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ + \ + if (diag[0] == 'U' || diag[0] == 'u') \ + diag_ = MagmaUnit; \ + else \ + diag_ = MagmaNonUnit; \ + \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ + R() = MAGMA_FN(uplo_, diag_, M, \ + reinterpret_cast( \ + const_cast(A.data())), \ + LDA, &info); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below // Handle type and space permutations -#define KOKKOSBLAS_DTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_STRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_ZTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, 
Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_CTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) +#ifdef KOKKOS_ENABLE_CUDA + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, 
Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#else + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#endif // Handle layout permutations -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, false) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, false) 
+KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, false) } // namespace Impl -} // nameSpace KokkosBlas +} // nameSpace KokkosLapack -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#endif // KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ diff --git a/lapack/unit_test/CMakeLists.txt b/lapack/unit_test/CMakeLists.txt new file mode 100644 index 0000000000..a2c2305a12 --- /dev/null +++ b/lapack/unit_test/CMakeLists.txt @@ -0,0 +1,94 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/test_common) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/test_common) + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + +##################### +# # +# Define unit-tests # +# # +##################### + +##################### +# # +# Add GPU backends # +# # +##################### +IF (KOKKOS_ENABLE_CUDA) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_cuda + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Cuda_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_hip + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + 
backends/Test_HIP_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMPTARGET) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # lapack_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_Lapack.cpp + # COMPONENTS lapack + # ) +ENDIF () + + + +##################### +# # +# Add CPU backends # +# # +##################### +IF (KOKKOS_ENABLE_SERIAL) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMP) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_openmp + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_THREADS) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + diff --git a/lapack/unit_test/Test_Lapack.hpp b/lapack/unit_test/Test_Lapack.hpp new file mode 100644 index 0000000000..815c442884 --- /dev/null +++ b/lapack/unit_test/Test_Lapack.hpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_LAPACK_HPP +#define TEST_LAPACK_HPP + +#include "Test_Lapack_gesv.hpp" +#include "Test_Lapack_trtri.hpp" + +#endif // TEST_LAPACK_HPP diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp similarity index 89% rename from blas/unit_test/Test_Blas_gesv.hpp rename to lapack/unit_test/Test_Lapack_gesv.hpp index 57ee6373bf..06f51b7eb0 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -14,19 +14,20 @@ // //@HEADER -// only enable this test where KokkosBlas supports gesv: -// CUDA+MAGMA and HOST+BLAS -#if (defined(TEST_CUDA_BLAS_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - (defined(TEST_OPENMP_BLAS_CPP) || defined(TEST_OPENMPTARGET_BLAS_CPP) || \ - defined(TEST_SERIAL_BLAS_CPP) || defined(TEST_THREADS_BLAS_CPP))) +// only enable this test where KokkosLapack supports gesv: +// CUDA+MAGMA and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || \ + defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ + defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include #include #include -#include +#include #include #include #include @@ -89,15 +90,15 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Solve. 
try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -105,8 +106,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -194,15 +195,15 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // Solve. 
try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -210,8 +211,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -342,14 +343,14 @@ int test_gesv_mrhs(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_float"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); test_gesv("N"); // No pivoting test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_float"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_float"); test_gesv_mrhs("N"); // No pivoting test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); @@ -360,14 +361,14 @@ TEST_F(TestCategory, gesv_mrhs_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { - 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); test_gesv("N"); // No pivoting test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_double"); test_gesv_mrhs("N"); // No pivoting test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); @@ -378,14 +379,14 @@ TEST_F(TestCategory, gesv_mrhs_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); test_gesv, TestDevice>("N"); // No pivoting test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_double"); test_gesv_mrhs, TestDevice>("N"); // No pivoting test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); @@ -396,18 +397,18 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_float"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); test_gesv, TestDevice>("N"); // No pivoting test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_float"); + 
Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_float"); test_gesv_mrhs, TestDevice>("N"); // No pivoting test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif -#endif // CUDA+MAGMA or BLAS+HOST +#endif // CUDA+MAGMA or LAPACK+HOST diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp similarity index 94% rename from blas/unit_test/Test_Blas_trtri.hpp rename to lapack/unit_test/Test_Lapack_trtri.hpp index aa12fa959b..a19e575d89 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -118,8 +118,8 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, // const int As0 = A.stride(0), As1 = A.stride(1); // const int Ae0 = A.extent(0), Ae1 = A.extent(1); - // printf("KokkosBlas::trtri test for %c %c, M %d, N %d, eps %g, ViewType: %s, - // A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d + // printf("KokkosLapack::trtri test for %c %c, M %d, N %d, eps %g, ViewType: + // %s, A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d // START\n", uplo[0],diag[0],M,N,eps,typeid(ViewTypeA).name(), As0, As1, Ae0, // Ae1); fflush(stdout); @@ -141,7 +141,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); Kokkos::deep_copy(A, host_A); } - return KokkosBlas::trtri(uplo, diag, A); + return KokkosLapack::trtri(uplo, diag, A); } // If M is greater than 100 and A is an unit triangluar matrix, make A the @@ -158,13 +158,13 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, using functor_type = UnitDiagTRTRI; functor_type udtrtri(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, M), 
udtrtri); } else { //(diag[0]=='N')||(diag[0]=='n') using functor_type = NonUnitDiagTRTRI; functor_type nudtrtri(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, M), nudtrtri); } Kokkos::fence(); @@ -195,11 +195,11 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, #endif // A = A^-1 - ret = KokkosBlas::trtri(uplo, diag, A); + ret = KokkosLapack::trtri(uplo, diag, A); Kokkos::fence(); if (ret) { - printf("KokkosBlas::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], + printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], typeid(ViewTypeA).name(), ret); return ret; } @@ -229,7 +229,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, vgemm.alpha = ScalarA(1); vgemm.beta = beta; Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", + "KokkosLapack::Test::VanillaGEMM", Kokkos::TeamPolicy( M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), @@ -362,7 +362,7 @@ int test_trtri(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_float"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); test_trtri("UN"); test_trtri("UU"); test_trtri("LN"); @@ -375,7 +375,7 @@ TEST_F(TestCategory, trtri_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); test_trtri("UN"); test_trtri("UU"); test_trtri("LN"); @@ -388,7 +388,7 @@ TEST_F(TestCategory, trtri_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { - 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_double"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); test_trtri, TestDevice>("UN"); test_trtri, TestDevice>("UU"); test_trtri, TestDevice>("LN"); @@ -401,7 +401,7 @@ TEST_F(TestCategory, trtri_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_float"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); test_trtri, TestDevice>("UN"); test_trtri, TestDevice>("UU"); test_trtri, TestDevice>("LN"); diff --git a/lapack/unit_test/backends/Test_Cuda_Lapack.cpp b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp new file mode 100644 index 0000000000..d75988ef81 --- /dev/null +++ b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_LAPACK_CPP +#define TEST_CUDA_LAPACK_CPP + +#include +#include + +#endif // TEST_CUDA_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_HIP_Lapack.cpp b/lapack/unit_test/backends/Test_HIP_Lapack.cpp new file mode 100644 index 0000000000..c0ec152233 --- /dev/null +++ b/lapack/unit_test/backends/Test_HIP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_LAPACK_CPP +#define TEST_HIP_LAPACK_CPP + +#include "Test_HIP.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_HIP_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp new file mode 100644 index 0000000000..533580fd23 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_LAPACK_CPP +#define TEST_OPENMP_LAPACK_CPP + +#include +#include + +#endif // TEST_OPENMP_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_Serial_Lapack.cpp b/lapack/unit_test/backends/Test_Serial_Lapack.cpp new file mode 100644 index 0000000000..d0324b9642 --- /dev/null +++ b/lapack/unit_test/backends/Test_Serial_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_LAPACK_CPP +#define TEST_SERIAL_LAPACK_CPP + +#include +#include + +#endif // TEST_SERIAL_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_Threads_Lapack.cpp b/lapack/unit_test/backends/Test_Threads_Lapack.cpp new file mode 100644 index 0000000000..aa1acbcf6c --- /dev/null +++ b/lapack/unit_test/backends/Test_Threads_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_LAPACK_CPP +#define TEST_THREADS_LAPACK_CPP + +#include +#include + +#endif // TEST_THREADS_LAPACK_CPP diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index cbadcef0b1..de2db8dbb0 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -21,7 +21,7 @@ #include -#include +#include #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -185,7 +185,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -196,7 +196,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -300,7 +300,7 @@ struct parallel_blas_trtri { void operator()(const int& i) const { auto svA = Kokkos::subview(trtri_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); + KokkosLapack::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP && diff --git a/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 0be3abac08..c6e5d406a7 100644 --- a/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -27,7 +27,7 @@ 
#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) #include "KokkosBlas3_trmm.hpp" -#include "KokkosBlas_trtri.hpp" +#include "KokkosLapack_trtri.hpp" #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -1472,12 +1472,12 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, // call trtri on device auto dViewLjj = Kokkos::subview(dViewL, range_type(0, nscol), Kokkos::ALL()); - KokkosBlas::trtri(&uplo_char, &diag_char, dViewLjj); + KokkosLapack::trtri(&uplo_char, &diag_char, dViewLjj); } else #endif { // call trtri on host - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + KokkosLapack::trtri(&uplo_char, &diag_char, Ljj); } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time1 += timer.seconds(); From 315b4ec3593c426f738c6a009af35f9e97466472 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 27 Oct 2023 22:28:20 -0600 Subject: [PATCH 229/231] Merge pull request #2024 from eeprude/lapackDir_fix Quick fix on 'lapack' subdirectory, for night compilation with Trilinos (cherry picked from commit 9caa7ca90789278dc92e6d7411a073cc53fb8e31) --- .../backends/Test_OpenMPTarget_Lapack.cpp | 22 +++++++++++++++++++ .../unit_test/backends/Test_SYCL_Lapack.cpp | 22 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_SYCL_Lapack.cpp diff --git a/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp new file mode 100644 index 0000000000..5191918ce9 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_LAPACK_CPP +#define TEST_OPENMPTARGET_LAPACK_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_OPENMPTARGET_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_SYCL_Lapack.cpp b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp new file mode 100644 index 0000000000..9485f2a420 --- /dev/null +++ b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_LAPACK_CPP +#define TEST_SYCL_LAPACK_CPP + +#include +#include + +#endif // TEST_SYCL_LAPACK_CPP From 1ff8cd54b2cef7613ff5c5348b4d3cdee28a4e32 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Sun, 5 Nov 2023 19:07:06 -0700 Subject: [PATCH 230/231] Merge pull request #2029 from lucbv/sycl_mkl_trilinos_fix SYCL: fix for Trilinos build with MKL (cherry picked from commit 78455b485ee827f699dbc6c87767290bcdc4dc4d) --- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 4 +++- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 6 ++++-- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 3 ++- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 4 +++- cmake/KokkosKernels_config.h.in | 1 + cmake/kokkoskernels_tpls.cmake | 4 ++++ sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 3 ++- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 3 ++- 8 files changed, 21 insertions(+), 7 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 7bc55becc0..de930f6107 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -88,7 +88,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 62139d2b12..736523aa8d 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -364,7 +364,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) #endif -#if 
defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) #include #include #include @@ -437,6 +439,6 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL && KOKKOS_ENABLE_SYCL #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 70b5560f6e..0820badd9a 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -161,7 +161,8 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ template \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 304dd349bf..2ace065808 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -767,7 +767,9 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS // ONEMKL -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 7a61771231..d94860e380 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -29,6 +29,7 @@ requires (a) header file(s) as well, and may use functions other than just BLAS and LAPACK functions. 
*/ #cmakedefine HAVE_KOKKOSKERNELS_MKL +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE #cmakedefine KOKKOSKERNELS_ENABLE_BENCHMARK diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index f650168757..08c7158148 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -31,6 +31,10 @@ MACRO(KOKKOSKERNELS_ADD_TPL_OPTION NAME DEFAULT_VALUE DOCSTRING) SET(ROOT_DEFAULT $ENV{${_NAME_ORIG}_ROOT}) KOKKOSKERNELS_ADD_OPTION(${_NAME_ORIG}_ROOT "${ROOT_DEFAULT}" PATH "Location of ${_NAME} install root. Default: None or the value of the environment variable ${_NAME}_ROOT if set") IF (DEFINED TPL_ENABLE_${_NAME}) + IF (${_NAME} STREQUAL MKL AND KOKKOSKERNELS_HAS_TRILINOS) + MESSAGE("Trilinos has enabled MKL and SYCL but it does not detect oneMKL correctly so we disable it!") + SET(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE ON) + ENDIF () IF (TPL_ENABLE_${_NAME} AND NOT KOKKOSKERNELS_ENABLE_TPL_${_NAME}) MESSAGE("Overriding KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG}=OFF with TPL_ENABLE_${_NAME}=ON") SET(KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG} ON) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 01a0ce1373..653ec94811 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -249,7 +249,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 23d85c0b5c..efb591375b 100644 --- 
a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -684,7 +684,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #undef KOKKOSSPARSE_SPMV_MKL #endif -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return oneapi::mkl::transpose::nontrans; From d0c412ed0971c52bae6d524591192d34a6e98bef Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 8 Nov 2023 14:26:26 -0700 Subject: [PATCH 231/231] Adding Changelog for Release 4.2.0 (#2031) * Adding Changelog for Release 4.2.0 Part of Kokkos C++ Performance Portability Programming EcoSystem 4.2 * Formatting the changelog a bit more Mentioning more clearly LAPACK vs BLAS, grouping PRs by logical work unit, etc... * Remove minor revisions, improve text descriptions * Changelog: add spmv perftest detail --------- Co-authored-by: Luc Berger Co-authored-by: Carl Pearson Co-authored-by: brian-kelley --- CHANGELOG.md | 261 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d582fc354f..59c3f5a647 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,266 @@ # Change Log +## [4.2.00](https://github.com/kokkos/kokkos-kernels/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.1.00...4.2.00) + +### New Features + +#### BLAS updates +- Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### LAPACK +- New component added for the implementation of LAPACK algorithms and to support associated TPLs [\#1985](https://github.com/kokkos/kokkos-kernels/pull/1985) +- Fix some issue with unit-test definition for SYCL backend in the new LAPACK component 
[\#2024](https://github.com/kokkos/kokkos-kernels/pull/2024) + +#### Sparse updates +- Extract diagonal blocks from a CRS matrix into separate CRS matrices [\#1947](https://github.com/kokkos/kokkos-kernels/pull/1947) +- Adding exec space instance to spmv [\#1932](https://github.com/kokkos/kokkos-kernels/pull/1932) +- Add merge-based SpMV [\#1911](https://github.com/kokkos/kokkos-kernels/pull/1911) +- Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (PSGS and Team_PSGS) [\#1906](https://github.com/kokkos/kokkos-kernels/pull/1906) +- Add a MergeMatrixDiagonal abstraction to KokkosSparse [\#1780](https://github.com/kokkos/kokkos-kernels/pull/1780) + +#### ODE updates +- Newton solver [\#1924](https://github.com/kokkos/kokkos-kernels/pull/1924) + +### Enhancements: + +#### Sparse +- MDF performance improvements exposing more parallelism in the implementation + - MDF: convert remaining count functor to hierarchical parallelism [\#1894](https://github.com/kokkos/kokkos-kernels/pull/1894) + - MDF: move most expensive kernels over to hierarchical parallelism [\#1893](https://github.com/kokkos/kokkos-kernels/pull/1893) +- Improvements to the Block Crs Matrix-Vector multiplication algorithm + - Improve BSR matrix SpMV Performance [\#1740](https://github.com/kokkos/kokkos-kernels/pull/1740) + - Disallow BsrMatrix tensor-core SpMV on non-scalar types [\#1937](https://github.com/kokkos/kokkos-kernels/pull/1937) + - remove triplicate sanity checks in BsrMatrix [\#1923](https://github.com/kokkos/kokkos-kernels/pull/1923) + - remove duplicate BSR SpMV tests [\#1922](https://github.com/kokkos/kokkos-kernels/pull/1922) +- Only deep_copy from device to host if supernodal sptrsv algorithms are used [\#1993](https://github.com/kokkos/kokkos-kernels/pull/1993) +- Improve KokkosSparse_kk_spmv [\#1979](https://github.com/kokkos/kokkos-kernels/pull/1979) + - Add 5 warm-up calls to get accurate, consistent timing + - Print out the matrix dimensions correctly when loading from disk 
+- sparse/impl: Make PSGS non-blocking [\#1917](https://github.com/kokkos/kokkos-kernels/pull/1917) + +#### ODE +- ODE: changing layout of temp mem in RK algorithms [\#1908](https://github.com/kokkos/kokkos-kernels/pull/1908) +- ODE: adding adaptivity test for RK methods [\#1896](https://github.com/kokkos/kokkos-kernels/pull/1896) + +#### Common utilities +- Common: remove half and bhalf implementations (now in Kokkos Core) [\#1981](https://github.com/kokkos/kokkos-kernels/pull/1981) +- KokkosKernels: switching from printf macro to function [\#1977](https://github.com/kokkos/kokkos-kernels/pull/1977) +- OrdinalTraits: constexpr functions [\#1976](https://github.com/kokkos/kokkos-kernels/pull/1976) +- Parallel prefix sum can infer view type [\#1974](https://github.com/kokkos/kokkos-kernels/pull/1974) + +#### TPL support +- BSPGEMM: removing cusparse testing for version older than 11.4.0 [\#1996](https://github.com/kokkos/kokkos-kernels/pull/1996) +- Revise KokkosBlas::nrm2 TPL implementation [\#1950](https://github.com/kokkos/kokkos-kernels/pull/1950) +- Add TPL oneMKL GEMV support [\#1912](https://github.com/kokkos/kokkos-kernels/pull/1912) +- oneMKL spmv [\#1882](https://github.com/kokkos/kokkos-kernels/pull/1882) + +### Build System: +- CMakeLists.txt: Update Kokkos version to 4.2.99 for version check [\#2003](https://github.com/kokkos/kokkos-kernels/pull/2003) +- CMake: Adding logic to catch bad Kokkos version [\#1990](https://github.com/kokkos/kokkos-kernels/pull/1990) +- Remove calling tribits_exclude_autotools_files() [\#1888](https://github.com/kokkos/kokkos-kernels/pull/1888) + +### Documentation and Testing: +- Update create_gs_handle docs [\#1958](https://github.com/kokkos/kokkos-kernels/pull/1958) +- docs: Add testing table [\#1876](https://github.com/kokkos/kokkos-kernels/pull/1876) +- docs: Note which builds have ETI disabled [\#1934](https://github.com/kokkos/kokkos-kernels/pull/1934) +- Generate HTML docs 
[\#1921](https://github.com/kokkos/kokkos-kernels/pull/1921) +- github/workflows: Pin sphinx version [\#1948](https://github.com/kokkos/kokkos-kernels/pull/1948) +- github/workflows/docs.yml: Use up-to-date doxygen version [\#1941](https://github.com/kokkos/kokkos-kernels/pull/1941) + +- Unit-Test: adding specific test for block sparse functions [\#1944](https://github.com/kokkos/kokkos-kernels/pull/1944) +- Update SYCL docker image to Cuda 11.7.1 [\#1939](https://github.com/kokkos/kokkos-kernels/pull/1939) +- Remove printouts from the unit tests of ger() and syr() [\#1933](https://github.com/kokkos/kokkos-kernels/pull/1933) +- update testing scripts [\#1960](https://github.com/kokkos/kokkos-kernels/pull/1960) +- Speed up BSR spmv tests [\#1945](https://github.com/kokkos/kokkos-kernels/pull/1945) +- Test_ODE_Newton: Add template parameters for Kokkos::pair [\#1929](https://github.com/kokkos/kokkos-kernels/pull/1929) +- par_ilut: Update documentation for fill_in_limit [\#2001](https://github.com/kokkos/kokkos-kernels/pull/2001) + +### Benchmarks: +- perf_test/sparse: Update GS perf_test for streams [\#1963](https://github.com/kokkos/kokkos-kernels/pull/1963) +- Batched sparse perf_tests: Don't write to source tree during build [\#1904](https://github.com/kokkos/kokkos-kernels/pull/1904) +- ParILUT bench: fix unused IS_GPU warning [\#1900](https://github.com/kokkos/kokkos-kernels/pull/1900) +- BsrMatrix SpMV Google Benchmark [\#1886](https://github.com/kokkos/kokkos-kernels/pull/1886) +- Use extraction timestamps for fetched Google Benchmark files [\#1881](https://github.com/kokkos/kokkos-kernels/pull/1881) +- Improve help text in perf tests [\#1875](https://github.com/kokkos/kokkos-kernels/pull/1875) + +### Cleanup: +- iostream clean-up in benchmarks [\#2004](https://github.com/kokkos/kokkos-kernels/pull/2004) +- Rename TestExecSpace to TestDevice [\#1970](https://github.com/kokkos/kokkos-kernels/pull/1970) +- remove Intel 2017 code (no longer supported) 
[\#1920](https://github.com/kokkos/kokkos-kernels/pull/1920) +- clean-up implementations for move of HIP outside of experimental [\#1999](https://github.com/kokkos/kokkos-kernels/pull/1999) + +### Bug Fixes: +- upstream iostream removal fix [\#1991](https://github.com/kokkos/kokkos-kernels/pull/1991), [\#1995](https://github.com/kokkos/kokkos-kernels/pull/1995) +- Test and fix gemv stream interface [\#1987](https://github.com/kokkos/kokkos-kernels/pull/1987) +- Test_Sparse_spmv_bsr.hpp: Workaround cuda 11.2 compiler error [\#1983](https://github.com/kokkos/kokkos-kernels/pull/1983) +- Fix improper use of execution space instances in ODE tests. Better handling of CudaUVMSpaces during build. [\#1973](https://github.com/kokkos/kokkos-kernels/pull/1973) +- Don't assume the default memory space is used [\#1969](https://github.com/kokkos/kokkos-kernels/pull/1969) +- MDF: set default verbosity explicitly to avoid valgrind warnings [\#1968](https://github.com/kokkos/kokkos-kernels/pull/1968) +- Fix sort_and_merge functions for in-place case [\#1966](https://github.com/kokkos/kokkos-kernels/pull/1966) +- SPMV_Struct_Functor: initialize numExterior to 0 [\#1957](https://github.com/kokkos/kokkos-kernels/pull/1957) +- Use rank-1 impl types when rank-2 vector is dynamically rank 1 [\#1953](https://github.com/kokkos/kokkos-kernels/pull/1953) +- BsrMatrix: Check if CUDA is enabled before checking architecture [\#1955](https://github.com/kokkos/kokkos-kernels/pull/1955) +- Avoid enum without fixed underlying type to fix SYCL [\#1940](https://github.com/kokkos/kokkos-kernels/pull/1940) +- Fix SpAdd perf test when offset/ordinal is not int [\#1928](https://github.com/kokkos/kokkos-kernels/pull/1928) +- Add KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS definition for architectures with independent thread scheduling [\#1927](https://github.com/kokkos/kokkos-kernels/pull/1927) +- Fix cm_generate_makefile --boundscheck [\#1926](https://github.com/kokkos/kokkos-kernels/pull/1926) +- Bsr
compatibility [\#1925](https://github.com/kokkos/kokkos-kernels/pull/1925) +- BLAS: fix assignable check in gemv and gemm [\#1914](https://github.com/kokkos/kokkos-kernels/pull/1914) +- mdf: fix initial value in select pivot functor [\#1916](https://github.com/kokkos/kokkos-kernels/pull/1916) +- add missing headers, std::vector -> std::vector<...> [\#1909](https://github.com/kokkos/kokkos-kernels/pull/1909) +- Add missing include to Test_Sparse_MergeMatrix.hpp [\#1907](https://github.com/kokkos/kokkos-kernels/pull/1907) +- Remove non-existent dir from CMake include paths [\#1892](https://github.com/kokkos/kokkos-kernels/pull/1892) +- cusparse 12 spmv: check y vector alignment [\#1889](https://github.com/kokkos/kokkos-kernels/pull/1889) +- Change 'or' to '||' to fix compilation on MSVC [\#1885](https://github.com/kokkos/kokkos-kernels/pull/1885) +- Add missing KokkosKernels_Macros.hpp include [\#1884](https://github.com/kokkos/kokkos-kernels/pull/1884) +- Backward-compatible fix with kokkos@4.0 [\#1874](https://github.com/kokkos/kokkos-kernels/pull/1874) +- Fix for rocblas builds [\#1871](https://github.com/kokkos/kokkos-kernels/pull/1871) +- Correcting 'syr test' bug causing compilation errors with Trilinos [\#1870](https://github.com/kokkos/kokkos-kernels/pull/1870) +- Workaround for spiluk and sptrsv stream tests with OMP_NUM_THREADS of 1, 2, 3 [\#1864](https://github.com/kokkos/kokkos-kernels/pull/1864) +- bhalf_t fix for isnan function [\#2007](https://github.com/kokkos/kokkos-kernels/pull/2007) + + +## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) + +### New Features + +#### BLAS updates +- Adding interface with execution space instance argument to support execution of BLAS on stream + - Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) + - Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) + -
Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) +- Improving BLAS level 2 support by adding native implementation and TPL for GER, HER and SYR + - Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) + - Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### Batched updates +- Optimizing algorithms for single input data + - Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) + - Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) + +#### Sparse updates +- Adding stream support to ILUK/SPTRSV and sort/merge + - Streams interface for SPILUK numeric [\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) + - Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) + - Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) +- Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests [\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) +- sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) +- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) +- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) + +#### Misc updates +- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) + +### Enhancements: + +#### BLAS +- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) + +#### Batched +- batched/eti: ETI host-level interfaces 
[\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) +- batched/dense: Add gesv DynRankView runtime checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) + +#### Sparse +- Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) +- Sort and merge improvements [\#1773](https://github.com/kokkos/kokkos-kernels/pull/1773) +- spgemm handle: check that A,B,C graphs never change [\#1742](https://github.com/kokkos/kokkos-kernels/pull/1742) +- Fix/enhance backend issues on spadd perftest [\#1672](https://github.com/kokkos/kokkos-kernels/pull/1672) +- Spgemm perf test enhancements [\#1664](https://github.com/kokkos/kokkos-kernels/pull/1664) +- add explicit tests of opt-in algorithms in SpMV [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) + +#### Common utilities +- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) +- Add basis skeleton for KokkosKernels::print_configuration [\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) +- Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) +- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) + +#### TPL support +- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) +- MKL: support indices properly [\#1868](https://github.com/kokkos/kokkos-kernels/pull/1868) +- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) + + +### Build System: +- Do not change memory spaces instantiation defaults based on Kokkos_ENABLE_CUDA_UVM [\#1835](https://github.com/kokkos/kokkos-kernels/pull/1835) +- KokkosKernels: Remove TriBITS Kokkos subpackages (trilinos/Trilinos#11545) 
[\#1817](https://github.com/kokkos/kokkos-kernels/pull/1817) +- CMakeLists.txt: Add alias to match what is exported from Trilinos [\#1855](https://github.com/kokkos/kokkos-kernels/pull/1855) +- KokkosKernels: Don't list include for non-existent 'batched' build dir (trilinos/Trilinos#11966) [\#1867](https://github.com/kokkos/kokkos-kernels/pull/1867) +- Remove non-existent subdir kokkos-kernels/common/common (#11921, #11863) [\#1854](https://github.com/kokkos/kokkos-kernels/pull/1854) +- KokkosKernels: Remove non-existent common/src/[impl,tpls] include dirs (trilinos/Trilinos#11545) [\#1844](https://github.com/kokkos/kokkos-kernels/pull/1844) + +### Documentation and Testing: +- Enable sphinx werror [\#1856](https://github.com/kokkos/kokkos-kernels/pull/1856) +- Update cmake option naming in docs/comments [\#1849](https://github.com/kokkos/kokkos-kernels/pull/1849) +- docs/developer: Add Experimental namespace [\#1852](https://github.com/kokkos/kokkos-kernels/pull/1852) +- docs: Add profiling for compile times [\#1843](https://github.com/kokkos/kokkos-kernels/pull/1843) +- Ger: adding documentation stubs in apidocs [\#1822](https://github.com/kokkos/kokkos-kernels/pull/1822) +- .github/workflows: Summarize github-DOCS errors and warnings [\#1814](https://github.com/kokkos/kokkos-kernels/pull/1814) +- Blas1: docs update for PR #1803 [\#1805](https://github.com/kokkos/kokkos-kernels/pull/1805) +- apt-get update in hosted runner docs check [\#1797](https://github.com/kokkos/kokkos-kernels/pull/1797) +- scripts: Fix github-DOCS [\#1796](https://github.com/kokkos/kokkos-kernels/pull/1796) +- Add --enable-docs option to cm_generate_makefile [\#1785](https://github.com/kokkos/kokkos-kernels/pull/1785) +- docs: Add stubs for some sparse APIs [\#1768](https://github.com/kokkos/kokkos-kernels/pull/1768) +- .github: Update to actions/checkout@v3 [\#1767](https://github.com/kokkos/kokkos-kernels/pull/1767) +- docs: Include BatchedGemm
[\#1765](https://github.com/kokkos/kokkos-kernels/pull/1765) +- .github: Automation reminder [\#1726](https://github.com/kokkos/kokkos-kernels/pull/1726) +- Allow an HTML-only docs build [\#1723](https://github.com/kokkos/kokkos-kernels/pull/1723) +- SYCL CI: Specify the full path to the compiler [\#1670](https://github.com/kokkos/kokkos-kernels/pull/1670) +- Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) +- Add rocsparse,rocblas, to enabled TPLs in cm_test_all_sandia when --spot-check-tpls [\#1841](https://github.com/kokkos/kokkos-kernels/pull/1841) +- cm_test_all_sandia: update to add caraway queues for MI210, MI250 [\#1840](https://github.com/kokkos/kokkos-kernels/pull/1840) +- Support rocSparse in rocm 5.2.0 [\#1833](https://github.com/kokkos/kokkos-kernels/pull/1833) +- Add KokkosKernels_PullRequest_VEGA908_Tpls_ROCM520 support, only enable KokkosBlas::gesv where supported [\#1816](https://github.com/kokkos/kokkos-kernels/pull/1816) +- scripts: Include OMP settings [\#1801](https://github.com/kokkos/kokkos-kernels/pull/1801) +- Print the patch that clang-format-8 wants to apply [\#1714](https://github.com/kokkos/kokkos-kernels/pull/1714) + +### Benchmarks: +- Benchmark cleanup for par_ilut and spmv [\#1853](https://github.com/kokkos/kokkos-kernels/pull/1853) +- SpMV: adding benchmark for spmv [\#1821](https://github.com/kokkos/kokkos-kernels/pull/1821) +- New performance test for par_ilut, ginkgo::par_ilut, and spill [\#1799](https://github.com/kokkos/kokkos-kernels/pull/1799) +- Include OpenMP environment variables in benchmark context [\#1789](https://github.com/kokkos/kokkos-kernels/pull/1789) +- Re-enable and clean up triangle counting perf test [\#1752](https://github.com/kokkos/kokkos-kernels/pull/1752) +- Include google/benchmark lib version in benchmark output [\#1750](https://github.com/kokkos/kokkos-kernels/pull/1750) +- Refactor blas2 test for benchmark feature 
[\#1733](https://github.com/kokkos/kokkos-kernels/pull/1733) +- Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- Refactor blas1 test for benchmark feature [\#1636](https://github.com/kokkos/kokkos-kernels/pull/1636) + +### Cleanup: +- Drop outdated workarounds for backward compatibility with Kokkos [\#1836](https://github.com/kokkos/kokkos-kernels/pull/1836) +- Remove dead code guarded [\#1834](https://github.com/kokkos/kokkos-kernels/pull/1834) +- Remove decl ETI files [\#1824](https://github.com/kokkos/kokkos-kernels/pull/1824) +- Reorganize par_ilut performance test [\#1818](https://github.com/kokkos/kokkos-kernels/pull/1818) +- Deprecate Kokkos::Details::ArithTraits [\#1748](https://github.com/kokkos/kokkos-kernels/pull/1748) +- Drop obsolete workaround #ifdef KOKKOS_IF_ON_HOST [\#1720](https://github.com/kokkos/kokkos-kernels/pull/1720) +- Drop pre Kokkos 3.6 workaround [\#1653](https://github.com/kokkos/kokkos-kernels/pull/1653) +- View::Rank -> View::rank [\#1703](https://github.com/kokkos/kokkos-kernels/pull/1703) +- Prefer Kokkos::View::{R->r}ank [\#1679](https://github.com/kokkos/kokkos-kernels/pull/1679) +- Call concurrency(), not impl_thread_pool_size() [\#1666](https://github.com/kokkos/kokkos-kernels/pull/1666) +- Kokkos moves ALL_t out of Impl namespace [\#1658](https://github.com/kokkos/kokkos-kernels/pull/1658) +- Add KokkosKernels::Impl::are_integral_v helper variable template and quit using Kokkos::Impl::are_integral trait [\#1652](https://github.com/kokkos/kokkos-kernels/pull/1652) + +### Bug Fixes: +- Kokkos 4 compatibility: modifying the preprocessor logic [\#1827](https://github.com/kokkos/kokkos-kernels/pull/1827) +- blas/tpls: Fix gemm include guard typo [\#1848](https://github.com/kokkos/kokkos-kernels/pull/1848) +- spmv cusparse version check modified for cuda/11.1 [\#1828](https://github.com/kokkos/kokkos-kernels/pull/1828) +- Workaround for #1777 - cusparse spgemm test hang 
[\#1811](https://github.com/kokkos/kokkos-kernels/pull/1811) +- Fix 1798 [\#1800](https://github.com/kokkos/kokkos-kernels/pull/1800) +- BLAS: fixes and testing for LayoutStride [\#1794](https://github.com/kokkos/kokkos-kernels/pull/1794) +- Fix 1786: check that work array is contiguous in SVD [\#1793](https://github.com/kokkos/kokkos-kernels/pull/1793) +- Fix unused variable warnings [\#1790](https://github.com/kokkos/kokkos-kernels/pull/1790) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Common_UpperBound.hpp [\#1784](https://github.com/kokkos/kokkos-kernels/pull/1784) +- Batched Gesv: initializing variable to make compiler happy [\#1778](https://github.com/kokkos/kokkos-kernels/pull/1778) +- perf test utils: fix device ID parsing [\#1739](https://github.com/kokkos/kokkos-kernels/pull/1739) +- Fix OOB and improve comments in BsrMatrix COO constructor [\#1732](https://github.com/kokkos/kokkos-kernels/pull/1732) +- batched/unit_test: Disable simd dcomplex4 test for intel > 19.05 and <= 2021. [\#1857](https://github.com/kokkos/kokkos-kernels/pull/1857) +- rocsparse spmv tpl: Fix rocsparse_spmv call for rocm < 5.4.0 [\#1716](https://github.com/kokkos/kokkos-kernels/pull/1716) +- compatibility with 4.0.0 [\#1709](https://github.com/kokkos/kokkos-kernels/pull/1709) +- team mult: fix type issue in max_error calculation [\#1706](https://github.com/kokkos/kokkos-kernels/pull/1706) +- cast Kokkos::Impl::integral_constant to int [\#1697](https://github.com/kokkos/kokkos-kernels/pull/1697) + + ## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01)